Browse Source

1.0

tags/7.10.2
Indexea 3 years ago
parent
commit
49c72ef9c2
100 changed files with 17561 additions and 0 deletions
  1. +4
    -0
      .gitignore
  2. +78
    -0
      Readme.md
  3. +23
    -0
      core/Readme.md
  4. +296
    -0
      core/hanlp-part-of-speech-tagging.txt
  5. +33
    -0
      core/pom.xml
  6. +767
    -0
      core/src/main/java/com/hankcs/hanlp/HanLP.java
  7. +45
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ArrayCompare.java
  8. +67
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ArrayDistance.java
  9. +62
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/Dijkstra.java
  10. +232
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/EditDistance.java
  11. +65
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/LongestCommonSubsequence.java
  12. +102
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/LongestCommonSubstring.java
  13. +109
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/MaxHeap.java
  14. +271
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/Viterbi.java
  15. +100
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/Interval.java
  16. +216
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalNode.java
  17. +77
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalTree.java
  18. +26
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/Intervalable.java
  19. +15
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalableComparatorByPosition.java
  20. +20
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalableComparatorBySize.java
  21. +42
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Emit.java
  22. +22
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/FragmentToken.java
  23. +29
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/MatchToken.java
  24. +190
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/State.java
  25. +32
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Token.java
  26. +335
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Trie.java
  27. +37
    -0
      core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/TrieConfig.java
  28. +152
    -0
      core/src/main/java/com/hankcs/hanlp/classification/classifiers/AbstractClassifier.java
  29. +145
    -0
      core/src/main/java/com/hankcs/hanlp/classification/classifiers/IClassifier.java
  30. +205
    -0
      core/src/main/java/com/hankcs/hanlp/classification/classifiers/NaiveBayesClassifier.java
  31. +39
    -0
      core/src/main/java/com/hankcs/hanlp/classification/collections/FrequencyMap.java
  32. +170
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/AbstractDataSet.java
  33. +50
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/BagOfWordsDocument.java
  34. +95
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/Catalog.java
  35. +117
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/Document.java
  36. +184
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/FileDataSet.java
  37. +146
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/IDataSet.java
  38. +22
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/ITermFrequencyHolder.java
  39. +88
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/Lexicon.java
  40. +89
    -0
      core/src/main/java/com/hankcs/hanlp/classification/corpus/MemoryDataSet.java
  41. +58
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/BaseFeatureData.java
  42. +148
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/ChiSquareFeatureExtractor.java
  43. +32
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/DfFeatureData.java
  44. +17
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/IFeatureWeighter.java
  45. +22
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/TfIdfFeatureWeighter.java
  46. +12
    -0
      core/src/main/java/com/hankcs/hanlp/classification/features/TfOnlyFeatureWeighter.java
  47. +37
    -0
      core/src/main/java/com/hankcs/hanlp/classification/models/AbstractModel.java
  48. +34
    -0
      core/src/main/java/com/hankcs/hanlp/classification/models/NaiveBayesModel.java
  49. +257
    -0
      core/src/main/java/com/hankcs/hanlp/classification/statistics/ContinuousDistributions.java
  50. +110
    -0
      core/src/main/java/com/hankcs/hanlp/classification/statistics/evaluations/Evaluator.java
  51. +100
    -0
      core/src/main/java/com/hankcs/hanlp/classification/statistics/evaluations/FMeasure.java
  52. +77
    -0
      core/src/main/java/com/hankcs/hanlp/classification/tokenizers/BigramTokenizer.java
  53. +24
    -0
      core/src/main/java/com/hankcs/hanlp/classification/tokenizers/BlankTokenizer.java
  54. +46
    -0
      core/src/main/java/com/hankcs/hanlp/classification/tokenizers/HanLPTokenizer.java
  55. +22
    -0
      core/src/main/java/com/hankcs/hanlp/classification/tokenizers/ITokenizer.java
  56. +99
    -0
      core/src/main/java/com/hankcs/hanlp/classification/utilities/CollectionUtility.java
  57. +150
    -0
      core/src/main/java/com/hankcs/hanlp/classification/utilities/TextProcessUtility.java
  58. +47
    -0
      core/src/main/java/com/hankcs/hanlp/classification/utilities/io/ConsoleLogger.java
  59. +24
    -0
      core/src/main/java/com/hankcs/hanlp/classification/utilities/io/ILogger.java
  60. +1018
    -0
      core/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java
  61. +237
    -0
      core/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/State.java
  62. +1092
    -0
      core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAG.java
  63. +211
    -0
      core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGMap.java
  64. +548
    -0
      core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGNode.java
  65. +165
    -0
      core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGSet.java
  66. +312
    -0
      core/src/main/java/com/hankcs/hanlp/collection/MDAG/SimpleMDAGNode.java
  67. +198
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/DartMap.java
  68. +224
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/DoubleArray.java
  69. +47
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/Pair.java
  70. +176
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/AutoBytePool.java
  71. +118
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/AutoIntPool.java
  72. +148
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/BitVector.java
  73. +492
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/DawgBuilder.java
  74. +525
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/DoubleArrayBuilder.java
  75. +89
    -0
      core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/Keyset.java
  76. +152
    -0
      core/src/main/java/com/hankcs/hanlp/collection/sequence/SString.java
  77. +29
    -0
      core/src/main/java/com/hankcs/hanlp/collection/set/UnEmptyStringSet.java
  78. +1475
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java
  79. +33
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/ITrie.java
  80. +304
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java
  81. +668
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BinTrie.java
  82. +103
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/Node.java
  83. +27
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/_EmptyValueArray.java
  84. +44
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/_ValueArray.java
  85. +73
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/util/ArrayTool.java
  86. +19
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/CharacterMapping.java
  87. +220
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/IntArrayList.java
  88. +428
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/MutableDoubleArrayTrie.java
  89. +1385
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/MutableDoubleArrayTrieInteger.java
  90. +119
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/Utf8CharacterMapping.java
  91. +14
    -0
      core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/package-info.java
  92. +43
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLFixer.java
  93. +41
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLLoader.java
  94. +164
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLSentence.java
  95. +125
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLWord.java
  96. +51
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNllLine.java
  97. +90
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/Evaluator.java
  98. +63
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/PosTagCompiler.java
  99. +97
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/model/MaxEntDependencyModelMaker.java
  100. +81
    -0
      core/src/main/java/com/hankcs/hanlp/corpus/dependency/model/WordNatureWeightModelMaker.java

+ 4
- 0
.gitignore View File

@@ -0,0 +1,4 @@
/.idea/
/*/target/
.DS_Store
*.iml

+ 78
- 0
Readme.md View File

@@ -0,0 +1,78 @@
## ideaseg

`ideaseg` 是一个基于最新的 [HanLP](https://github.com/hankcs/HanLP/tree/1.x) 自然语言处理工具包实现的中文分词器,
包含了最新的模型数据,同时移除了 HanLP 所包含的非商业友好许可的 [NeuralNetworkParser](https://github.com/hankcs/HanLP/issues/644) 相关代码和数据。

`HanLP` 相比其他诸如 `IK`、`jcseg` 等分词器而言,在分词的准确率上有巨大的提升,但速度上有所牺牲。
通过对 `HanLP` 进行优化配置,`ideaseg` 在准确度和分词速度上取得了最佳的平衡。

与其他基于 `HanLP` 的插件相比,`ideaseg` 同步了最新 `HanLP` 的代码和数据,去除了无法商用的相关内容;实现了自动配置;
包含了模型数据,无需自行下载,使用简单方便。

`ideaseg` 提供三个模块包括:

1. `core` ~ 核心分词器模块
2. `elasticsearch` ~ ElasticSearch 的 ideaseg 分词插件 (默认版本 7.10.2)
3. `opensearch` ~ OpenSearch 的 ideaseg 分词插件 (默认版本 2.4.1)

此外 `data` 包含 `HanLP` 的模型数据。

由于 `ElasticSearch` 的插件机制严格绑定引擎本身的版本,而且版本众多,因此本项目不提供预编译的二进制版本,你需要自行下载源码进行构建。

### 构建

以下是插件的构建过程,在开始之前请先安装好 `git`、`java`、`maven` 等相关工具。

首先确定你的 `ElasticSearch` 或者 `OpenSearch` 的具体版本,假设你使用的是 `ElasticSearch` 7.10.2 版本,
请使用文本编辑器打开 `ideaseg/elasticsearch/pom.xml` 文件,修改 `elasticsearch.version` 对应的值为 `7.10.2`
(如果是 `OpenSearch` 请修改 `opensearch/pom.xml`)。

保存文件并打开命令行窗口,执行如下命令开始构建:

```shell
$ git clone https://gitee.com/indexea/ideaseg
$ cd ideaseg
$ mvn install
```

构建完成后,将在 `elasticsearch/target` 和 `opensearch/target` 目录下各生成一个插件文件 `ideaseg.zip`。

### 安装

构建完成后,我们可以利用 `ElasticSearch` 或 `OpenSearch` 提供的插件管理工具进行安装。

`ElasticSearch` 对应的插件管理工具为 `<elasticsearch>/bin/elasticsearch-plugin` ,
而 `OpenSearch` 对应的管理工具为 `<opensearch>/bin/opensearch-plugin`。
其中 `<elasticsearch>` 和 `<opensearch>` 为两个服务安装后所在的目录。

#### ElasticSearch 安装 ideaseg 插件

```shell
$ bin/elasticsearch-plugin install file:///<ideaseg>/elasticsearch/target/ideaseg.zip
```

#### OpenSearch 安装 ideaseg 插件

```shell
$ bin/opensearch-plugin install file:///<ideaseg>/opensearch/target/ideaseg.zip
```

其中 `<ideaseg>` 为 `ideaseg` 源码所在的路径。要特别注意的是路径前必须有 `file://` 。

安装过程会询问插件所需的权限,回车确认即可完成安装,安装完毕需要重启服务才能让插件生效。

接下来你可以使用分词测试工具来对插件进行测试,如下所示:

```
POST _analyze
{
"analyzer": "ideaseg",
"text": "你好,我用的是 ideaseg 分词插件。"
}
```

关于分词测试的详情请参考 [ElasticSearch 官方文档](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/test-analyzer.html)。

### 反馈问题

如果你在使用 `ideaseg` 过程中有任何问题,请通过 [Issues](https://gitee.com/indexea/ideaseg/issues) 提出。

+ 23
- 0
core/Readme.md View File

@@ -0,0 +1,23 @@
## ideaseg 分词核心模块

根据实际的比较结果来看,成熟可用的中文分词模块有如下几个:

* **HanLP (最佳选择)**
基于深度学习和海量语料训练的自然语言处理,代码结构差但是可用。性能相比 jcseg 可能有10倍差距(没细测) 但是准确度非常高,分词效果非常好。
该项目最新版本为 1.8.2,以后基本上不会再更新了,官方已经全面转向云服务。
* jcseg
速度快,但是分词定位错误问题始终没有解决,不过解决思路已经有了。暂时不予考虑。
* CoreNLP
非常专业的自然语言处理能力,但是体积大、速度慢,无法在生产中使用, 日后可以考虑使用它为一些高净值客户提供专属服务
* ikanalyzer
比较老而且成熟的基于词库分词,多年未更新,有志愿者一直提供 ES 插件的更新。

经过多次比较最终确定第一版基于 **HanLP** 实现分词功能。

**下一步需要做的包括:**

1. 提供多租户的分词个性化定制能力
2. 优化分词性能
3. 完善 HanLP 的代码结构
4. 提供多种分词模型
5. 研究如何更新语料库,生成新的语料模型

+ 296
- 0
core/hanlp-part-of-speech-tagging.txt View File

@@ -0,0 +1,296 @@
a
形容词
ad
副形词
ag
形容词性语素
al
形容词性惯用语
an
名形词
b
区别词
begin
仅用于始##始
bg
区别语素
bl
区别词性惯用语
c
连词
cc
并列连词
d
副词
dg
辄,俱,复之类的副词
dl
连语
e
叹词
end
仅用于终##终
f
方位词
g
学术词汇
gb
生物相关词汇
gbc
生物类别
gc
化学相关词汇
gg
地理地质相关词汇
gi
计算机相关词汇
gm
数学相关词汇
gp
物理相关词汇
h
前缀
i
成语
j
简称略语
k
后缀
l
习用语
m
数词
mg
数语素
Mg
甲乙丙丁之类的数词
mq
数量词
n
名词
nb
生物名
nba
动物名
nbc
动物纲目
nbp
植物名
nf
食品,比如“薯片”
ng
名词性语素
nh
医药疾病等健康相关名词
nhd
疾病
nhm
药品
ni
机构相关(不是独立机构名)
nic
下属机构
nis
机构后缀
nit
教育相关机构
nl
名词性惯用语
nm
物品名
nmc
化学品名
nn
工作相关名词
nnd
职业
nnt
职务职称
nr
人名
nr1
复姓
nr2
蒙古姓名
nrf
音译人名
nrj
日语人名
ns
地名
nsf
音译地名
nt
机构团体名
ntc
公司名
ntcb
银行
ntcf
工厂
ntch
酒店宾馆
nth
医院
nto
政府机构
nts
中小学
ntu
大学
nx
字母专名
nz
其他专名
o
拟声词
p
介词
pba
介词“把”
pbei
介词“被”
q
量词
qg
量词语素
qt
时量词
qv
动量词
r
代词
rg
代词性语素
Rg
古汉语代词性语素
rr
人称代词
ry
疑问代词
rys
处所疑问代词
ryt
时间疑问代词
ryv
谓词性疑问代词
rz
指示代词
rzs
处所指示代词
rzt
时间指示代词
rzv
谓词性指示代词
s
处所词
t
时间词
tg
时间词性语素
u
助词
ud
助词
ude1
的 底
ude2
ude3
udeng
等 等等 云云
udh
的话
ug
uguo
uj
助词
ul
连词
ule
了 喽
ulian
连 (“连小学生都会”)
uls
来讲 来说 而言 说来
usuo
uv
连词
uyy
一样 一般 似的 般
uz
uzhe
uzhi
v
动词
vd
副动词
vf
趋向动词
vg
动词性语素
vi
不及物动词(内动词)
vl
动词性惯用语
vn
名动词
vshi
动词“是”
vx
形式动词
vyou
动词“有”
w
标点符号
wb
百分号千分号,全角:% ‰ 半角:%
wd
逗号,全角:, 半角:,
wf
分号,全角:; 半角: ;
wh
单位符号,全角:¥ $ £ ° ℃ 半角:$
wj
句号,全角:。
wky
右括号,全角:) 〕 ] } 》 】 〗 〉 半角: ) ] { >
wkz
左括号,全角:( 〔 [ { 《 【 〖 〈 半角:( [ { <
wm
冒号,全角:: 半角: :
wn
顿号,全角:、
wp
破折号,全角:—— -- ——- 半角:— —-
ws
省略号,全角:…… …
wt
叹号,全角:!
ww
问号,全角:?
wyy
右引号,全角:” ’ 』
wyz
左引号,全角:“ ‘ 『
x
字符串
xu
网址URL
xx
非语素字
y
语气词(delete yg)
yg
语气语素
z
状态词
zg
状态词

+ 33
- 0
core/pom.xml View File

@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the ideaseg core tokenizer module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <!-- Version (${revision}) and shared properties come from the parent aggregator POM. -->
    <parent>
        <artifactId>ideaseg</artifactId>
        <groupId>com.indexea.ideaseg</groupId>
        <version>${revision}</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.indexea.ideaseg</groupId>
    <artifactId>ideaseg-core</artifactId>

    <!-- Lucene dependencies; ${lucene.version} is expected to be defined by the parent POM. -->
    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-codecs</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>
    </dependencies>

</project>

+ 767
- 0
core/src/main/java/com/hankcs/hanlp/HanLP.java View File

@@ -0,0 +1,767 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/10/17 19:02</create-date>
*
* <copyright file="HanLP.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp;

import com.hankcs.hanlp.corpus.io.IIOAdapter;
import com.hankcs.hanlp.dependency.perceptron.parser.KBeamArcEagerDependencyParser;
import com.hankcs.hanlp.dictionary.py.Pinyin;
import com.hankcs.hanlp.dictionary.py.PinyinDictionary;
import com.hankcs.hanlp.dictionary.ts.*;
import com.hankcs.hanlp.mining.phrase.IPhraseExtractor;
import com.hankcs.hanlp.mining.phrase.MutualInformationEntropyPhraseExtractor;
import com.hankcs.hanlp.mining.word.NewWordDiscover;
import com.hankcs.hanlp.mining.word.WordInfo;
import com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer;
import com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer;
import com.hankcs.hanlp.seg.NShort.NShortSegment;
import com.hankcs.hanlp.seg.Other.DoubleArrayTrieSegment;
import com.hankcs.hanlp.seg.Segment;
import com.hankcs.hanlp.seg.Viterbi.ViterbiSegment;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.summary.TextRankKeyword;
import com.hankcs.hanlp.summary.TextRankSentence;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import com.hankcs.hanlp.utility.Predefine;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.lang.reflect.Constructor;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.logging.Level;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
* HanLP: Han Language Processing <br>
* 汉语言处理包 <br>
* 常用接口工具类
*
* @author hankcs
*/
public class HanLP {
/**
* 库的全局配置,既可以用代码修改,也可以通过hanlp.properties配置(按照 变量名=值 的形式)
*/
/**
 * Global configuration of the library. Values may be changed in code, or
 * supplied through a hanlp.properties file (one "name=value" entry per field).
 */
public static final class Config {
    /** Debug mode flag (see {@link #enableDebug()}). */
    public static boolean DEBUG = false;
    /** Core dictionary path. */
    public static String CoreDictionaryPath = "data/dictionary/CoreNatureDictionary.txt";
    /** Part-of-speech transition matrix for the core dictionary. */
    public static String CoreDictionaryTransformMatrixDictionaryPath = "data/dictionary/CoreNatureDictionary.tr.txt";
    /** User-defined custom dictionary paths. */
    public static String CustomDictionaryPath[] = new String[]{"data/dictionary/custom/CustomDictionary.txt"};
    /** Whether the custom-dictionary cache is regenerated automatically (when the dictionary file is newer than the cache file). */
    public static boolean CustomDictionaryAutoRefreshCache = true;
    /** Bigram dictionary path. */
    public static String BiGramDictionaryPath = "data/dictionary/CoreNatureDictionary.ngram.txt";

    /** Stop-word dictionary path. */
    public static String CoreStopWordDictionaryPath = "data/dictionary/stopwords.txt";
    /** Synonym dictionary path. */
    public static String CoreSynonymDictionaryDictionaryPath = "data/dictionary/synonym/CoreSynonym.txt";
    /** Person-name dictionary path. */
    public static String PersonDictionaryPath = "data/dictionary/person/nr.txt";
    /** Person-name transition matrix path. */
    public static String PersonDictionaryTrPath = "data/dictionary/person/nr.tr.txt";
    /** Place-name dictionary path. */
    public static String PlaceDictionaryPath = "data/dictionary/place/ns.txt";
    /** Place-name transition matrix path. */
    public static String PlaceDictionaryTrPath = "data/dictionary/place/ns.tr.txt";
    /** Organization-name dictionary path (original comment mislabeled this as "place"; this is the nt dictionary). */
    public static String OrganizationDictionaryPath = "data/dictionary/organization/nt.txt";
    /** Organization-name transition matrix path. */
    public static String OrganizationDictionaryTrPath = "data/dictionary/organization/nt.tr.txt";
    /** Root directory of the traditional/simplified Chinese conversion dictionaries. */
    public static String tcDictionaryRoot = "data/dictionary/tc/";

    /** Pinyin dictionary path. */
    public static String PinyinDictionaryPath = "data/dictionary/pinyin/pinyin.txt";

    /** Transliterated person-name dictionary path. */
    public static String TranslatedPersonDictionaryPath = "data/dictionary/person/nrf.txt";

    /** Japanese person-name dictionary path. */
    public static String JapanesePersonDictionaryPath = "data/dictionary/person/nrj.txt";

    /** Character-type table. */
    public static String CharTypePath = "data/dictionary/other/CharType.bin";

    /** Character normalization table (full-width to half-width, traditional to simplified). */
    public static String CharTablePath = "data/dictionary/other/CharTable.txt";

    /** Part-of-speech tag-set description table, used for Chinese/English tag mapping. */
    public static String PartOfSpeechTagDictionary = "data/dictionary/other/TagPKU98.csv";

    /** Word / part-of-speech / dependency model. */
    public static String WordNatureModelPath = "data/model/dependency/WordNature.txt";

    /**
     * Maximum-entropy dependency model.
     *
     * @deprecated use {@link KBeamArcEagerDependencyParser} instead; future versions will no longer ship this model and will remove this setting
     */
    public static String MaxEntModelPath = "data/model/dependency/MaxEntModel.txt";
    /** Neural-network dependency model path. */
    public static String NNParserModelPath = "data/model/dependency/NNParserModel.txt";
    /** Perceptron ArcEager dependency model path. */
    public static String PerceptronParserModelPath = "data/model/dependency/perceptron.bin";
    /**
     * CRF segmentation model.
     *
     * @deprecated use {@link com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer} instead; future versions will no longer ship this model and will remove this setting
     */
    public static String CRFSegmentModelPath = "data/model/segment/CRFSegmentModel.txt";
    /**
     * HMM segmentation model.
     *
     * @deprecated use {@link PerceptronLexicalAnalyzer} instead
     */
    public static String HMMSegmentModelPath = "data/model/segment/HMMSegmentModel.bin";
    /** CRF word-segmentation model. */
    public static String CRFCWSModelPath = "data/model/crf/pku199801/cws.txt";
    /** CRF part-of-speech tagging model. */
    public static String CRFPOSModelPath = "data/model/crf/pku199801/pos.txt";
    /** CRF named-entity recognition model. */
    public static String CRFNERModelPath = "data/model/crf/pku199801/ner.txt";
    /** Perceptron word-segmentation model. */
    public static String PerceptronCWSModelPath = "data/model/perceptron/large/cws.bin";
    /** Perceptron part-of-speech tagging model. */
    public static String PerceptronPOSModelPath = "data/model/perceptron/pku1998/pos.bin";
    /** Perceptron named-entity recognition model. */
    public static String PerceptronNERModelPath = "data/model/perceptron/pku1998/ner.bin";
    /** Whether segmentation results carry part-of-speech tags. */
    public static boolean ShowTermNature = true;
    /** Whether to normalize characters (traditional->simplified, full->half width, upper->lower); after changing this, the CustomDictionary.txt.bin cache must be deleted. */
    public static boolean Normalization = false;
    /**
     * IO adapter (default null = read from the local file system); implement
     * com.hankcs.hanlp.corpus.io.IIOAdapter to run HanLP on other platforms (Hadoop, Redis, ...).
     */
    public static IIOAdapter IOAdapter;

    static {
        // Load configuration automatically at class-initialization time.
        Properties p = new Properties();
        try {
            ClassLoader loader = Thread.currentThread().getContextClassLoader();
            if (loader == null) { // IKVM (v.0.44.0.5) doesn't set context classloader
                loader = HanLP.Config.class.getClassLoader();
            }
            try {
                // Prefer the explicit properties path when set; otherwise load hanlp.properties from the classpath.
                p.load(new InputStreamReader(Predefine.HANLP_PROPERTIES_PATH == null ?
                        loader.getResourceAsStream("hanlp.properties") :
                        new FileInputStream(Predefine.HANLP_PROPERTIES_PATH)
                        , "UTF-8"));
            } catch (Exception e) {
                // Fallback: take the data root from the HANLP_ROOT system property or environment variable.
                String HANLP_ROOT = System.getProperty("HANLP_ROOT");
                if (HANLP_ROOT == null) HANLP_ROOT = System.getenv("HANLP_ROOT");
                if (HANLP_ROOT != null) {
                    HANLP_ROOT = HANLP_ROOT.trim();
                    p = new Properties();
                    p.setProperty("root", HANLP_ROOT);
                    logger.info("使用环境变量 HANLP_ROOT=" + HANLP_ROOT);
                } else throw e;
            }
            // Resolve every configured path against the root directory (backslashes normalized to '/').
            String root = p.getProperty("root", "").replaceAll("\\\\", "/");
            if (root.length() > 0 && !root.endsWith("/")) root += "/";
            CoreDictionaryPath = root + p.getProperty("CoreDictionaryPath", CoreDictionaryPath);
            CoreDictionaryTransformMatrixDictionaryPath = root + p.getProperty("CoreDictionaryTransformMatrixDictionaryPath", CoreDictionaryTransformMatrixDictionaryPath);
            BiGramDictionaryPath = root + p.getProperty("BiGramDictionaryPath", BiGramDictionaryPath);
            CoreStopWordDictionaryPath = root + p.getProperty("CoreStopWordDictionaryPath", CoreStopWordDictionaryPath);
            CoreSynonymDictionaryDictionaryPath = root + p.getProperty("CoreSynonymDictionaryDictionaryPath", CoreSynonymDictionaryDictionaryPath);
            PersonDictionaryPath = root + p.getProperty("PersonDictionaryPath", PersonDictionaryPath);
            PersonDictionaryTrPath = root + p.getProperty("PersonDictionaryTrPath", PersonDictionaryTrPath);
            // Custom dictionaries: ';'-separated list; an entry starting with a space is resolved
            // relative to the directory of the previous entry.
            String[] pathArray = p.getProperty("CustomDictionaryPath", "data/dictionary/custom/CustomDictionary.txt").split(";");
            String prePath = root;
            for (int i = 0; i < pathArray.length; ++i) {
                if (pathArray[i].startsWith(" ")) {
                    pathArray[i] = prePath + pathArray[i].trim();
                } else {
                    pathArray[i] = root + pathArray[i];
                    int lastSplash = pathArray[i].lastIndexOf('/');
                    if (lastSplash != -1) {
                        prePath = pathArray[i].substring(0, lastSplash + 1);
                    }
                }
            }
            CustomDictionaryPath = pathArray;
            CustomDictionaryAutoRefreshCache = "true".equals(p.getProperty("CustomDictionaryAutoRefreshCache", "true"));
            tcDictionaryRoot = root + p.getProperty("tcDictionaryRoot", tcDictionaryRoot);
            if (!tcDictionaryRoot.endsWith("/")) tcDictionaryRoot += '/';
            PinyinDictionaryPath = root + p.getProperty("PinyinDictionaryPath", PinyinDictionaryPath);
            TranslatedPersonDictionaryPath = root + p.getProperty("TranslatedPersonDictionaryPath", TranslatedPersonDictionaryPath);
            JapanesePersonDictionaryPath = root + p.getProperty("JapanesePersonDictionaryPath", JapanesePersonDictionaryPath);
            PlaceDictionaryPath = root + p.getProperty("PlaceDictionaryPath", PlaceDictionaryPath);
            PlaceDictionaryTrPath = root + p.getProperty("PlaceDictionaryTrPath", PlaceDictionaryTrPath);
            OrganizationDictionaryPath = root + p.getProperty("OrganizationDictionaryPath", OrganizationDictionaryPath);
            OrganizationDictionaryTrPath = root + p.getProperty("OrganizationDictionaryTrPath", OrganizationDictionaryTrPath);
            CharTypePath = root + p.getProperty("CharTypePath", CharTypePath);
            CharTablePath = root + p.getProperty("CharTablePath", CharTablePath);
            PartOfSpeechTagDictionary = root + p.getProperty("PartOfSpeechTagDictionary", PartOfSpeechTagDictionary);
            WordNatureModelPath = root + p.getProperty("WordNatureModelPath", WordNatureModelPath);
            MaxEntModelPath = root + p.getProperty("MaxEntModelPath", MaxEntModelPath);
            NNParserModelPath = root + p.getProperty("NNParserModelPath", NNParserModelPath);
            PerceptronParserModelPath = root + p.getProperty("PerceptronParserModelPath", PerceptronParserModelPath);
            CRFSegmentModelPath = root + p.getProperty("CRFSegmentModelPath", CRFSegmentModelPath);
            HMMSegmentModelPath = root + p.getProperty("HMMSegmentModelPath", HMMSegmentModelPath);
            CRFCWSModelPath = root + p.getProperty("CRFCWSModelPath", CRFCWSModelPath);
            CRFPOSModelPath = root + p.getProperty("CRFPOSModelPath", CRFPOSModelPath);
            CRFNERModelPath = root + p.getProperty("CRFNERModelPath", CRFNERModelPath);
            PerceptronCWSModelPath = root + p.getProperty("PerceptronCWSModelPath", PerceptronCWSModelPath);
            PerceptronPOSModelPath = root + p.getProperty("PerceptronPOSModelPath", PerceptronPOSModelPath);
            PerceptronNERModelPath = root + p.getProperty("PerceptronNERModelPath", PerceptronNERModelPath);
            ShowTermNature = "true".equals(p.getProperty("ShowTermNature", "true"));
            Normalization = "true".equals(p.getProperty("Normalization", "false"));
            // Optional IO adapter: instantiate the configured class reflectively via its no-arg constructor.
            String ioAdapterClassName = p.getProperty("IOAdapter");
            if (ioAdapterClassName != null) {
                try {
                    Class<?> clazz = Class.forName(ioAdapterClassName);
                    Constructor<?> ctor = clazz.getConstructor();
                    Object instance = ctor.newInstance();
                    if (instance != null) IOAdapter = (IIOAdapter) instance;
                } catch (ClassNotFoundException e) {
                    logger.warning(String.format("找不到IO适配器类: %s ,请检查第三方插件jar包", ioAdapterClassName));
                } catch (NoSuchMethodException e) {
                    logger.warning(String.format("工厂类[%s]没有默认构造方法,不符合要求", ioAdapterClassName));
                } catch (SecurityException e) {
                    logger.warning(String.format("工厂类[%s]默认构造方法无法访问,不符合要求", ioAdapterClassName));
                } catch (Exception e) {
                    logger.warning(String.format("工厂类[%s]构造失败:%s\n", ioAdapterClassName, TextUtility.exceptionToString(e)));
                }
            }
        } catch (Exception e) {
            // Configuration could not be loaded; fall back to ./data when present, otherwise log hints.
            if (new File("data/dictionary/CoreNatureDictionary.tr.txt").isFile()) {
                logger.info("使用当前目录下的data");
            } else {
                StringBuilder sbInfo = new StringBuilder("========Tips========\n请将hanlp.properties放在下列目录:\n"); // print some friendly tips
                if (new File("src/main/java").isDirectory()) {
                    sbInfo.append("src/main/resources");
                } else {
                    String classPath = (String) System.getProperties().get("java.class.path");
                    if (classPath != null) {
                        for (String path : classPath.split(File.pathSeparator)) {
                            if (new File(path).isDirectory()) {
                                sbInfo.append(path).append('\n');
                            }
                        }
                    }
                    sbInfo.append("Web项目则请放到下列目录:\n" +
                            "Webapp/WEB-INF/lib\n" +
                            "Webapp/WEB-INF/classes\n" +
                            "Appserver/lib\n" +
                            "JRE/lib\n");
                    sbInfo.append("并且编辑root=PARENT/path/to/your/data\n");
                    sbInfo.append("现在HanLP将尝试从").append(System.getProperties().get("user.dir")).append("读取data……");
                }
                logger.severe("没有找到hanlp.properties,可能会导致找不到data\n" + sbInfo);
            }
        }
    }

    /**
     * Enable debug mode (degrades performance).
     */
    public static void enableDebug() {
        enableDebug(true);
    }

    /**
     * Enable or disable debug mode (debug mode degrades performance).
     *
     * @param enable true to enable, false to disable
     */
    public static void enableDebug(boolean enable) {
        DEBUG = enable;
        if (DEBUG) {
            logger.setLevel(Level.ALL);
        } else {
            logger.setLevel(Level.OFF);
        }
    }
}

/**
 * Utility class; not meant to be instantiated.
 */
private HanLP() {
}

/**
 * Convert Traditional Chinese to Simplified Chinese.
 *
 * @param traditionalChineseString Traditional Chinese text
 * @return Simplified Chinese text
 */
public static String convertToSimplifiedChinese(String traditionalChineseString) {
    return TraditionalChineseDictionary.convertToSimplifiedChinese(traditionalChineseString.toCharArray());
}

/**
 * Convert Simplified Chinese to Traditional Chinese.
 *
 * @param simplifiedChineseString Simplified Chinese text
 * @return Traditional Chinese text
 */
public static String convertToTraditionalChinese(String simplifiedChineseString) {
    return SimplifiedChineseDictionary.convertToTraditionalChinese(simplifiedChineseString.toCharArray());
}

/**
 * Simplified to Traditional; shorthand for {@link com.hankcs.hanlp.HanLP#convertToTraditionalChinese(java.lang.String)}.
 *
 * @param s Simplified Chinese
 * @return Traditional Chinese (mainland standard)
 */
public static String s2t(String s) {
    return HanLP.convertToTraditionalChinese(s);
}

/**
 * Traditional to Simplified; shorthand for {@link HanLP#convertToSimplifiedChinese(String)}.
 *
 * @param t Traditional Chinese (mainland standard)
 * @return Simplified Chinese
 */
public static String t2s(String t) {
    return HanLP.convertToSimplifiedChinese(t);
}

/**
 * Simplified to Taiwan Traditional.
 *
 * @param s Simplified Chinese
 * @return Taiwan Traditional Chinese
 */
public static String s2tw(String s) {
    return SimplifiedToTaiwanChineseDictionary.convertToTraditionalTaiwanChinese(s);
}

/**
 * Taiwan Traditional to Simplified.
 *
 * @param tw Taiwan Traditional Chinese
 * @return Simplified Chinese
 */
public static String tw2s(String tw) {
    return TaiwanToSimplifiedChineseDictionary.convertToSimplifiedChinese(tw);
}

/**
 * Simplified to Hong Kong Traditional.
 *
 * @param s Simplified Chinese
 * @return Hong Kong Traditional Chinese
 */
public static String s2hk(String s) {
    return SimplifiedToHongKongChineseDictionary.convertToTraditionalHongKongChinese(s);
}

/**
 * Hong Kong Traditional to Simplified.
 *
 * @param hk Hong Kong Traditional Chinese
 * @return Simplified Chinese
 */
public static String hk2s(String hk) {
    return HongKongToSimplifiedChineseDictionary.convertToSimplifiedChinese(hk);
}

/**
 * Traditional to Taiwan Traditional.
 *
 * @param t Traditional Chinese
 * @return Taiwan Traditional Chinese
 */
public static String t2tw(String t) {
    return TraditionalToTaiwanChineseDictionary.convertToTaiwanChinese(t);
}

/**
 * Taiwan Traditional to Traditional.
 *
 * @param tw Taiwan Traditional Chinese
 * @return Traditional Chinese
 */
public static String tw2t(String tw) {
    return TaiwanToTraditionalChineseDictionary.convertToTraditionalChinese(tw);
}

/**
 * Traditional to Hong Kong Traditional.
 *
 * @param t Traditional Chinese
 * @return Hong Kong Traditional Chinese
 */
public static String t2hk(String t) {
    return TraditionalToHongKongChineseDictionary.convertToHongKongTraditionalChinese(t);
}

/**
 * Hong Kong Traditional to Traditional.
 *
 * @param hk Hong Kong Traditional Chinese
 * @return Traditional Chinese
 */
public static String hk2t(String hk) {
    return HongKongToTraditionalChineseDictionary.convertToTraditionalChinese(hk);
}

/**
 * Hong Kong Traditional to Taiwan Traditional.
 *
 * @param hk Hong Kong Traditional Chinese
 * @return Taiwan Traditional Chinese
 */
public static String hk2tw(String hk) {
    return HongKongToTaiwanChineseDictionary.convertToTraditionalTaiwanChinese(hk);
}

/**
 * Taiwan Traditional to Hong Kong Traditional.
 *
 * @param tw Taiwan Traditional Chinese
 * @return Hong Kong Traditional Chinese
 */
public static String tw2hk(String tw) {
    return TaiwanToHongKongChineseDictionary.convertToTraditionalHongKongChinese(tw);
}

/**
 * Convert text to a pinyin string.
 *
 * @param text       input text
 * @param separator  separator placed between consecutive pinyin tokens
 * @param remainNone some characters have no pinyin (e.g. punctuation); when true they
 *                   appear as "none", when false the original character is kept
 * @return a string of the form [pinyin][separator][pinyin]...
 */
public static String convertToPinyinString(String text, String separator, boolean remainNone) {
    // Ask for placeholders (second arg true) so the list stays aligned 1:1 with the input characters.
    List<Pinyin> pinyinList = PinyinDictionary.convertToPinyin(text, true);
    int total = pinyinList.size();
    StringBuilder out = new StringBuilder(total * (5 + separator.length()));
    int index = 0;
    for (Pinyin py : pinyinList) {
        if (py != Pinyin.none5 || remainNone) {
            out.append(py.getPinyinWithoutTone());
        } else {
            // No pinyin for this character and caller wants the original char kept.
            out.append(text.charAt(index));
        }
        ++index;
        if (index < total) {
            out.append(separator);
        }
    }
    return out.toString();
}

/**
 * Convert text to a list of pinyin.
 *
 * @param text text to analyze
 * @return a list of pinyin
 */
public static List<Pinyin> convertToPinyinList(String text) {
    return PinyinDictionary.convertToPinyin(text);
}

/**
 * Convert text to a string of pinyin first letters.
 *
 * @param text       input text
 * @param separator  separator placed between consecutive letters
 * @param remainNone whether characters without pinyin (e.g. punctuation) are kept (as "none")
 * @return a string of the form [first letter][separator][first letter]...
 */
public static String convertToPinyinFirstCharString(String text, String separator, boolean remainNone) {
    List<Pinyin> pinyinList = PinyinDictionary.convertToPinyin(text, remainNone);
    int total = pinyinList.size();
    StringBuilder out = new StringBuilder(total * (1 + separator.length()));
    int index = 0;
    for (Pinyin py : pinyinList) {
        out.append(py.getFirstChar());
        ++index;
        // Separator between tokens only — never after the last one.
        if (index < total) {
            out.append(separator);
        }
    }
    return out.toString();
}

/**
 * Segment text into words using the standard tokenizer.
 *
 * @param text input text
 * @return the segmented terms
 */
public static List<Term> segment(String text) {
    return StandardTokenizer.segment(text.toCharArray());
}

/**
 * Creates a segmenter.<br>
 * This is a factory method<br>
 * Compared with instantiating a segmenter directly, the benefit is that when HanLP is
 * upgraded, callers always get the most suitable implementation.
 *
 * @return a segmenter
 */
public static Segment newSegment() {
    return new ViterbiSegment(); // the Viterbi segmenter is currently the best balance of speed and accuracy
}

/**
 * Creates a segmenter by algorithm name (factory method).
 *
 * @param algorithm the algorithm, by English or Chinese name (case-insensitive):<br>
 *                  <ul>
 *                  <li>viterbi (维特比): best balance of speed and accuracy</li>
 *                  <li>dat (双数组trie树): extremely fast dictionary segmentation</li>
 *                  <li>crf (条件随机场): high accuracy segmentation, POS tagging and NER</li>
 *                  <li>perceptron (感知机): segmentation, POS tagging and NER with online learning</li>
 *                  <li>nshort (N最短路): slightly better NER, at the cost of speed</li>
 *                  </ul>
 * @return a segmenter
 * @throws IllegalArgumentException if the name is null or not recognised
 * @throws RuntimeException if the CRF or perceptron model fails to load
 */
public static Segment newSegment(String algorithm) {
    if (algorithm == null) {
        throw new IllegalArgumentException(String.format("非法参数 algorithm == %s", algorithm));
    }
    String name = algorithm.toLowerCase();
    if ("viterbi".equals(name) || "维特比".equals(name)) {
        return new ViterbiSegment(); // the Viterbi segmenter is currently the best balance of speed and accuracy
    }
    if ("dat".equals(name) || "双数组trie树".equals(name)) {
        return new DoubleArrayTrieSegment();
    }
    if ("nshort".equals(name) || "n最短路".equals(name)) {
        return new NShortSegment();
    }
    if ("crf".equals(name) || "条件随机场".equals(name)) {
        try {
            return new CRFLexicalAnalyzer();
        } catch (IOException e) {
            logger.warning("CRF模型加载失败");
            throw new RuntimeException(e);
        }
    }
    if ("perceptron".equals(name) || "感知机".equals(name)) {
        try {
            return new PerceptronLexicalAnalyzer();
        } catch (IOException e) {
            logger.warning("感知机模型加载失败");
            throw new RuntimeException(e);
        }
    }
    throw new IllegalArgumentException(String.format("非法参数 algorithm == %s", name));
}

/**
 * Extracts phrases from text.
 *
 * @param text the text
 * @param size how many phrases are wanted
 * @return a phrase list with at most {@code size} entries
 */
public static List<String> extractPhrase(String text, int size) {
    IPhraseExtractor extractor = new MutualInformationEntropyPhraseExtractor();
    return extractor.extractPhrase(text, size);
}

/**
 * Extracts words from a large text.
 *
 * @param text a large text
 * @param size number of words to extract
 * @return a word list
 */
public static List<WordInfo> extractWords(String text, int size) {
    return extractWords(text, size, false);
}

/**
 * Extracts words from a reader.
 *
 * @param reader text source
 * @param size   number of words to extract
 * @return a word list
 */
public static List<WordInfo> extractWords(BufferedReader reader, int size) throws IOException {
    return extractWords(reader, size, false);
}

/**
 * Extracts words (new-word discovery).
 *
 * @param text         a large text
 * @param size         number of words to extract
 * @param newWordsOnly whether to extract only words absent from the dictionary
 * @return a word list
 */
public static List<WordInfo> extractWords(String text, int size, boolean newWordsOnly) {
    // defaults: max_word_len=4, min_freq=0, min_entropy=0.5, min_aggregation=100
    // (parameter meanings per the full overload at the bottom of this group)
    NewWordDiscover discover = new NewWordDiscover(4, 0.0f, .5f, 100f, newWordsOnly);
    return discover.discover(text, size);
}

/**
 * Extracts words (new-word discovery).
 *
 * @param reader       text source
 * @param size         number of words to extract
 * @param newWordsOnly whether to extract only words absent from the dictionary
 * @return a word list
 */
public static List<WordInfo> extractWords(BufferedReader reader, int size, boolean newWordsOnly) throws IOException {
    // same defaults as the String overload above
    NewWordDiscover discover = new NewWordDiscover(4, 0.0f, .5f, 100f, newWordsOnly);
    return discover.discover(reader, size);
}

/**
 * Extracts words (new-word discovery) with explicit thresholds.
 *
 * @param reader          text source
 * @param size            number of words to extract
 * @param newWordsOnly    whether to extract only words absent from the dictionary
 * @param max_word_len    maximum word length
 * @param min_freq        minimum word frequency
 * @param min_entropy     minimum word entropy
 * @param min_aggregation minimum mutual information (aggregation)
 * @return a word list
 */
public static List<WordInfo> extractWords(BufferedReader reader, int size, boolean newWordsOnly, int max_word_len, float min_freq, float min_entropy, float min_aggregation) throws IOException {
    NewWordDiscover discover = new NewWordDiscover(max_word_len, min_freq, min_entropy, min_aggregation, newWordsOnly);
    return discover.discover(reader, size);
}

/**
 * Extracts keywords from a document (TextRank).
 *
 * @param document document content
 * @param size     how many keywords are wanted
 * @return a keyword list
 */
public static List<String> extractKeyword(String document, int size) {
    return TextRankKeyword.getKeywordList(document, size);
}

/**
 * Automatic summarisation.
 * The default sentence delimiters used to split the document are ,,。::“”??!!;;
 *
 * @param document the target document
 * @param size     number of key sentences wanted
 * @return a list of key sentences
 */
public static List<String> extractSummary(String document, int size) {
    return TextRankSentence.getTopSentenceList(document, size);
}

/**
 * Automatic summarisation.
 * The default sentence delimiters used to split the document are ,,。::“”??!!;;
 *
 * @param document   the target document
 * @param max_length maximum character length of the summary
 * @return the summary text
 */
public static String getSummary(String document, int max_length) {
    // max_length is the maximum string length of the required summary;
    // the actual summary generated may be shorter than that, but never longer.
    return TextRankSentence.getSummary(document, max_length);
}

/**
 * Automatic summarisation.
 *
 * @param document           the target document
 * @param size               number of key sentences wanted
 * @param sentence_separator sentence delimiters used to split the document, in regex form, e.g. [。??!!;;]
 * @return a list of key sentences
 */
public static List<String> extractSummary(String document, int size, String sentence_separator) {
    return TextRankSentence.getTopSentenceList(document, size, sentence_separator);
}

/**
 * Automatic summarisation.
 *
 * @param document           the target document
 * @param max_length         maximum character length of the summary
 * @param sentence_separator sentence delimiters used to split the document, in regex form, e.g. [。??!!;;]
 * @return the summary text
 */
public static String getSummary(String document, int max_length, String sentence_separator) {
    // max_length is the maximum string length of the required summary;
    // the actual summary generated may be shorter than that, but never longer.
    return TextRankSentence.getSummary(document, max_length, sentence_separator);
}

}

+ 45
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ArrayCompare.java View File

@@ -0,0 +1,45 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/9/17 14:15</create-date>
*
* <copyright file="ArrayCompare.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

/**
 * Lexicographic comparison of two arrays.
 * @author hankcs
 */
public class ArrayCompare
{
    /**
     * Compares arrays A and B element by element; when all shared positions are equal,
     * the shorter array is considered smaller.
     *
     * @param arrayA first array
     * @param arrayB second array
     * @return negative if A &lt; B, zero if equal, positive if A &gt; B
     */
    public static int compare(Long[] arrayA, Long[] arrayB)
    {
        int common = Math.min(arrayA.length, arrayB.length);
        for (int i = 0; i < common; ++i)
        {
            int cmp = arrayA[i].compareTo(arrayB[i]);
            if (cmp != 0)
            {
                return cmp;
            }
        }
        return arrayA.length - arrayB.length;
    }
}

+ 67
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ArrayDistance.java View File

@@ -0,0 +1,67 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/9/17 9:47</create-date>
*
* <copyright file="BinSearch.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

import java.util.TreeSet;

/**
* 求两个集合中最相近的两个数
*
* @author hankcs
*/
public class ArrayDistance
{
public static Long computeMinimumDistance(TreeSet<Long> setA, TreeSet<Long> setB)
{
Long[] arrayA = setA.toArray(new Long[0]);
Long[] arrayB = setB.toArray(new Long[0]);
return computeMinimumDistance(arrayA, arrayB);
}

public static Long computeMinimumDistance(Long[] arrayA, Long[] arrayB)
{
int aIndex = 0;
int bIndex = 0;
long min = Math.abs(arrayA[0] - arrayB[0]);
while (true)
{
if (arrayA[aIndex] > arrayB[bIndex])
{
bIndex++;
}
else
{
aIndex++;
}
if (aIndex >= arrayA.length || bIndex >= arrayB.length)
{
break;
}
if (Math.abs(arrayA[aIndex] - arrayB[bIndex]) < min)
{
min = Math.abs(arrayA[aIndex] - arrayB[bIndex]);
}
}

return min;
}

public static Long computeAverageDistance(Long[] arrayA, Long[] arrayB)
{
Long totalA = 0L;
Long totalB = 0L;
for (Long a : arrayA) totalA += a;
for (Long b : arrayB) totalB += b;

return Math.abs(totalA / arrayA.length - totalB / arrayB.length);
}
}

+ 62
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/Dijkstra.java View File

@@ -0,0 +1,62 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/12/9 12:18</create-date>
*
* <copyright file="Dijkstra.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

import com.hankcs.hanlp.seg.Dijkstra.Path.State;
import com.hankcs.hanlp.seg.common.EdgeFrom;
import com.hankcs.hanlp.seg.common.Graph;
import com.hankcs.hanlp.seg.common.Vertex;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;

/**
 * Shortest path (Dijkstra) over a segmentation word graph.
 * @author hankcs
 */
public class Dijkstra
{
    /**
     * Computes the lightest path through the graph.
     * The search runs backwards: distances are seeded at the LAST vertex and edges are
     * relaxed through {@code edgesTo} (incoming edges), so {@code path[v]} ends up
     * holding the successor of {@code v}; the final walk starting at vertex 0 therefore
     * emits the path in forward order.
     * NOTE(review): assumes vertex 0 is the source and the last vertex the target —
     * confirm against how {@code Graph} is built.
     *
     * @param graph the word graph
     * @return vertices on the shortest path, in forward order
     */
    public static List<Vertex> compute(Graph graph)
    {
        List<Vertex> resultList = new LinkedList<Vertex>();
        Vertex[] vertexes = graph.getVertexes();
        List<EdgeFrom>[] edgesTo = graph.getEdgesTo();
        double[] d = new double[vertexes.length];
        Arrays.fill(d, Double.MAX_VALUE);
        d[d.length - 1] = 0; // seed at the end vertex
        int[] path = new int[vertexes.length];
        Arrays.fill(path, -1);
        PriorityQueue<State> que = new PriorityQueue<State>();
        que.add(new State(0, vertexes.length - 1));
        while (!que.isEmpty())
        {
            State p = que.poll();
            if (d[p.vertex] < p.cost) continue; // stale queue entry; a better cost was already settled
            for (EdgeFrom edgeFrom : edgesTo[p.vertex])
            {
                if (d[edgeFrom.from] > d[p.vertex] + edgeFrom.weight)
                {
                    d[edgeFrom.from] = d[p.vertex] + edgeFrom.weight;
                    que.add(new State(d[edgeFrom.from], edgeFrom.from));
                    path[edgeFrom.from] = p.vertex;
                }
            }
        }
        // follow successor links from the first vertex until the chain ends (-1)
        for (int t = 0; t != -1; t = path[t])
        {
            resultList.add(vertexes[t]);
        }
        return resultList;
    }
}

+ 232
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/EditDistance.java View File

@@ -0,0 +1,232 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/9/13 20:30</create-date>
*
* <copyright file="EditDistance.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

import com.hankcs.hanlp.dictionary.common.CommonSynonymDictionary;

import java.util.List;

/**
 * Edit-distance implementations: an id-weighted ("semantic") variant for synonym ids,
 * and a Damerau-Levenshtein variant (unit costs, adjacent transposition allowed) for
 * character data.
 *
 * @author hankcs
 */
public class EditDistance
{
    /**
     * Semantic distance between two synonym-item lists, computed on their entry ids.
     *
     * @param synonymItemListA first synonym list
     * @param synonymItemListB second synonym list
     * @return id-weighted edit distance between the two id sequences
     */
    public static long compute(List<CommonSynonymDictionary.SynonymItem> synonymItemListA, List<CommonSynonymDictionary.SynonymItem> synonymItemListB)
    {
        long[] arrayA = new long[synonymItemListA.size()];
        long[] arrayB = new long[synonymItemListB.size()];
        int i = 0;
        for (CommonSynonymDictionary.SynonymItem item : synonymItemListA)
        {
            arrayA[i++] = item.entry.id;
        }
        i = 0;
        for (CommonSynonymDictionary.SynonymItem item : synonymItemListB)
        {
            arrayB[i++] = item.entry.id;
        }
        return compute(arrayA, arrayB);
    }

    /**
     * Id-weighted edit distance: substituting a for b costs |a - b|, inserting b costs b,
     * deleting a costs a.
     * NOTE(review): the first DP row/column are initialised with index costs (j and i)
     * rather than cumulative id costs — kept as-is to preserve the original metric.
     *
     * @param arrayA first id sequence
     * @param arrayB second id sequence
     * @return the distance; Long.MAX_VALUE / 3 when either sequence is empty (a large
     *         sentinel chosen so that summing a few of them cannot overflow)
     */
    public static long compute(long[] arrayA, long[] arrayB)
    {
        final int m = arrayA.length;
        final int n = arrayB.length;
        if (m == 0 || n == 0) return Long.MAX_VALUE / 3;

        long[][] d = new long[m + 1][n + 1];
        for (int j = 0; j <= n; ++j)
        {
            d[0][j] = j;
        }
        for (int i = 0; i <= m; ++i)
        {
            d[i][0] = i;
        }

        for (int i = 1; i <= m; ++i)
        {
            long ci = arrayA[i - 1];
            for (int j = 1; j <= n; ++j)
            {
                long cj = arrayB[j - 1];
                if (ci == cj)
                {
                    d[i][j] = d[i - 1][j - 1];
                }
                else
                {
                    // candidates: substitute ci with cj / insert cj / delete ci
                    d[i][j] = Math.min(d[i - 1][j - 1] + Math.abs(ci - cj), Math.min(d[i][j - 1] + cj, d[i - 1][j] + ci));
                }
            }
        }

        return d[m][n];
    }

    /**
     * Id-weighted edit distance over int sequences; same cost model as the long variant.
     *
     * @param arrayA first id sequence
     * @param arrayB second id sequence
     * @return the distance; Integer.MAX_VALUE / 3 when either sequence is empty
     */
    public static int compute(int[] arrayA, int[] arrayB)
    {
        final int m = arrayA.length;
        final int n = arrayB.length;
        if (m == 0 || n == 0) return Integer.MAX_VALUE / 3;

        int[][] d = new int[m + 1][n + 1];
        for (int j = 0; j <= n; ++j)
        {
            d[0][j] = j;
        }
        for (int i = 0; i <= m; ++i)
        {
            d[i][0] = i;
        }

        for (int i = 1; i <= m; ++i)
        {
            int ci = arrayA[i - 1];
            for (int j = 1; j <= n; ++j)
            {
                int cj = arrayB[j - 1];
                if (ci == cj)
                {
                    d[i][j] = d[i - 1][j - 1];
                }
                else
                {
                    // candidates: substitute ci with cj / insert cj / delete ci
                    d[i][j] = Math.min(d[i - 1][j - 1] + Math.abs(ci - cj), Math.min(d[i][j - 1] + cj, d[i - 1][j] + ci));
                }
            }
        }

        return d[m][n];
    }

    /**
     * Edit distance between two strings (symmetric in its arguments).
     *
     * @param a string A
     * @param b string B
     * @return the distance between them
     */
    public static int compute(String a, String b)
    {
        return ed(a, b);
    }

    /**
     * Edit distance between two strings (symmetric in its arguments).
     * Delegates to {@link #compute(char[], char[])}, which implements the identical
     * Damerau-Levenshtein algorithm (previously duplicated here line for line).
     *
     * @param wrongWord string A
     * @param rightWord string B
     * @return the distance between them
     */
    public static int ed(String wrongWord, String rightWord)
    {
        return compute(wrongWord.toCharArray(), rightWord.toCharArray());
    }

    /**
     * Damerau-Levenshtein distance over char arrays: substitution, insertion and
     * deletion each cost 1, and swapping two adjacent characters costs 1.
     *
     * @param wrongWord string A (symmetric with B)
     * @param rightWord string B
     * @return the distance between them
     */
    public static int compute(char[] wrongWord, char[] rightWord)
    {
        final int m = wrongWord.length;
        final int n = rightWord.length;

        int[][] d = new int[m + 1][n + 1];
        for (int j = 0; j <= n; ++j)
        {
            d[0][j] = j;
        }
        for (int i = 0; i <= m; ++i)
        {
            d[i][0] = i;
        }

        for (int i = 1; i <= m; ++i)
        {
            char ci = wrongWord[i - 1];
            for (int j = 1; j <= n; ++j)
            {
                char cj = rightWord[j - 1];
                if (ci == cj)
                {
                    d[i][j] = d[i - 1][j - 1];
                }
                else if (i > 1 && j > 1 && ci == rightWord[j - 2] && cj == wrongWord[i - 2])
                {
                    // adjacent transposition
                    d[i][j] = 1 + Math.min(d[i - 2][j - 2], Math.min(d[i][j - 1], d[i - 1][j]));
                }
                else
                {
                    // candidates: substitute ci with cj / insert cj / delete ci
                    d[i][j] = Math.min(d[i - 1][j - 1] + 1, Math.min(d[i][j - 1] + 1, d[i - 1][j] + 1));
                }
            }
        }

        return d[m][n];
    }
}

+ 65
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/LongestCommonSubsequence.java View File

@@ -0,0 +1,65 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/7 11:29</create-date>
*
* <copyright file="LongestCommonSubsequence.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

/**
 * Longest Common Subsequence: the longest sequence of characters common to both inputs,
 * not required to be contiguous.
 * @author hankcs
 */
public class LongestCommonSubsequence
{
    /**
     * Computes the LCS length by forward dynamic programming:
     * dp[i][j] = LCS length of str1[0..i) and str2[0..j).
     *
     * @param str1 first character sequence
     * @param str2 second character sequence
     * @return length of the longest common subsequence
     */
    public static int compute(char[] str1, char[] str2)
    {
        int len1 = str1.length;
        int len2 = str2.length;
        int[][] dp = new int[len1 + 1][len2 + 1];

        for (int i = 1; i <= len1; ++i)
        {
            for (int j = 1; j <= len2; ++j)
            {
                if (str1[i - 1] == str2[j - 1])
                {
                    // matching characters extend the best common subsequence of the prefixes
                    dp[i][j] = dp[i - 1][j - 1] + 1;
                }
                else
                {
                    // otherwise drop one character from either side and keep the better result
                    dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
                }
            }
        }
        return dp[len1][len2];
    }

    /**
     * Convenience overload for strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return length of the longest common subsequence
     */
    public static int compute(String str1, String str2)
    {
        return compute(str1.toCharArray(), str2.toCharArray());
    }
}

+ 102
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/LongestCommonSubstring.java View File

@@ -0,0 +1,102 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/7 10:10</create-date>
*
* <copyright file="ArrayCount.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

/**
 * Length of the longest common substring.<br>
 * The Longest Common Substring of two strings is their longest shared run of characters;
 * unlike a subsequence, it must be contiguous.
 *
 * @author hankcs
 */
public class LongestCommonSubstring
{
    /**
     * Computes the length of the longest contiguous run shared by {@code str1} and
     * {@code str2}. Implemented as dynamic programming with two rolling rows:
     * prev[j+1] / curr[j+1] hold the length of the common suffix ending at the current
     * character pair. (The original diagonal-shift implementation duplicated its scan
     * loop twice; this single pass keeps the same O(|str1|*|str2|) time with O(|str2|)
     * extra memory and identical results.)
     *
     * @param str1 first character sequence
     * @param str2 second character sequence
     * @return length of the longest common substring; 0 if either input is empty
     */
    public static int compute(char[] str1, char[] str2)
    {
        int size1 = str1.length;
        int size2 = str2.length;
        if (size1 == 0 || size2 == 0) return 0;

        int longest = 0;
        int[] prev = new int[size2 + 1];
        int[] curr = new int[size2 + 1];
        for (int i = 0; i < size1; ++i)
        {
            for (int j = 0; j < size2; ++j)
            {
                if (str1[i] == str2[j])
                {
                    // extend the common suffix ending at the previous character pair
                    curr[j + 1] = prev[j] + 1;
                    if (curr[j + 1] > longest)
                    {
                        longest = curr[j + 1];
                    }
                }
                else
                {
                    curr[j + 1] = 0; // mismatch breaks the run
                }
            }
            // roll the rows: current row becomes previous for the next i
            int[] tmp = prev;
            prev = curr;
            curr = tmp;
        }
        return longest;
    }

    /**
     * Convenience overload for strings.
     *
     * @param str1 first string
     * @param str2 second string
     * @return length of the longest common substring
     */
    public static int compute(String str1, String str2)
    {
        return compute(str1.toCharArray(), str2.toCharArray());
    }
}

+ 109
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/MaxHeap.java View File

@@ -0,0 +1,109 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>2015/11/22 13:23</create-date>
*
* <copyright file="MaxHeap.java" company="码农场">
* Copyright (c) 2008-2015, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

import java.util.*;

/**
* 用固定容量的优先队列模拟的最大堆,用于解决求topN大的问题
*
* @author hankcs
*/
public class MaxHeap<E> implements Iterable<E>
{
/**
* 优先队列
*/
private PriorityQueue<E> queue;
/**
* 堆的最大容量
*/
private int maxSize;

/**
* 构造最大堆
* @param maxSize 保留多少个元素
* @param comparator 比较器,生成最大堆使用o1-o2,生成最小堆使用o2-o1,并修改 e.compareTo(peek) 比较规则
*/
public MaxHeap(int maxSize, Comparator<E> comparator)
{
if (maxSize <= 0)
throw new IllegalArgumentException();
this.maxSize = maxSize;
this.queue = new PriorityQueue<E>(maxSize, comparator);
}

/**
* 添加一个元素
* @param e 元素
* @return 是否添加成功
*/
public boolean add(E e)
{
if (queue.size() < maxSize)
{ // 未达到最大容量,直接添加
queue.add(e);
return true;
}
else
{ // 队列已满
E peek = queue.peek();
if (queue.comparator().compare(e, peek) > 0)
{ // 将新元素与当前堆顶元素比较,保留较小的元素
queue.poll();
queue.add(e);
return true;
}
}
return false;
}

/**
* 添加许多元素
* @param collection
*/
public MaxHeap<E> addAll(Collection<E> collection)
{
for (E e : collection)
{
add(e);
}

return this;
}

/**
* 转为有序列表,自毁性操作
* @return
*/
public List<E> toList()
{
ArrayList<E> list = new ArrayList<E>(queue.size());
while (!queue.isEmpty())
{
list.add(0, queue.poll());
}

return list;
}

@Override
public Iterator<E> iterator()
{
return queue.iterator();
}

public int size()
{
return queue.size();
}
}

+ 271
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/Viterbi.java View File

@@ -0,0 +1,271 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/9/10 17:12</create-date>
*
* <copyright file="Viterbi.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.algorithm;

import com.hankcs.hanlp.corpus.dictionary.item.EnumItem;
import com.hankcs.hanlp.corpus.tag.Nature;
import com.hankcs.hanlp.dictionary.TransformMatrix;
import com.hankcs.hanlp.dictionary.TransformMatrixDictionary;
import com.hankcs.hanlp.seg.common.Vertex;

import java.util.*;

/**
 * Viterbi decoding over several HMM representations used by the segmenters.
 * All costs are compared with {@code <} and combined with {@code +}, i.e. the inputs
 * are expected to be pre-converted to (negative) log space and the algorithms minimise.
 *
 * @author hankcs
 */
public class Viterbi
{
    /**
     * Solves a generic HMM; all probabilities must be converted to log space in advance
     * (the code minimises sums, consistent with negative log probabilities).
     *
     * @param obs     observation sequence
     * @param states  hidden state ids
     * @param start_p initial costs per hidden state
     * @param trans_p transition costs between hidden states
     * @param emit_p  emission costs (hidden state producing an observation)
     * @return the most likely hidden-state sequence
     */
    public static int[] compute(int[] obs, int[] states, double[] start_p, double[][] trans_p, double[][] emit_p)
    {
        // states need not be dense: size tables by the largest state id + 1
        int _max_states_value = 0;
        for (int s : states)
        {
            _max_states_value = Math.max(_max_states_value, s);
        }
        ++_max_states_value;
        double[][] V = new double[obs.length][_max_states_value];
        int[][] path = new int[_max_states_value][obs.length];

        for (int y : states)
        {
            V[0][y] = start_p[y] + emit_p[y][obs[0]];
            path[y][0] = y;
        }

        for (int t = 1; t < obs.length; ++t)
        {
            int[][] newpath = new int[_max_states_value][obs.length];

            for (int y : states)
            {
                double prob = Double.MAX_VALUE;
                int state;
                for (int y0 : states)
                {
                    double nprob = V[t - 1][y0] + trans_p[y0][y] + emit_p[y][obs[t]];
                    if (nprob < prob)
                    {
                        prob = nprob;
                        state = y0;
                        // record the best cost so far
                        V[t][y] = prob;
                        // record the path through the best predecessor
                        System.arraycopy(path[state], 0, newpath[y], 0, t);
                        newpath[y][t] = y;
                    }
                }
            }

            path = newpath;
        }

        // pick the cheapest final state and return its recorded path
        double prob = Double.MAX_VALUE;
        int state = 0;
        for (int y : states)
        {
            if (V[obs.length - 1][y] < prob)
            {
                prob = V[obs.length - 1][y];
                state = y;
            }
        }

        return path[state];
    }

    /**
     * Specialised solver that confirms the nature (POS tag) of each vertex in place.
     * NOTE(review): only the globally cheapest predecessor tag per position is kept
     * (no full per-state backpointer table), so this is a greedy approximation of
     * full Viterbi — preserved as-is.
     *
     * @param vertexList                path containing the Vertex.B start node
     * @param transformMatrixDictionary transition matrix for the dictionary's tag set
     */
    public static void compute(List<Vertex> vertexList, TransformMatrix transformMatrixDictionary)
    {
        if (Nature.values().length != transformMatrixDictionary.states.length)
            transformMatrixDictionary.extend(Nature.values().length);
        int length = vertexList.size() - 1;
        double[][] cost = new double[2][]; // rolling array
        Iterator<Vertex> iterator = vertexList.iterator();
        Vertex start = iterator.next();
        Nature pre = start.attribute.nature[0];
        // the first one is fixed
        // start.confirmNature(pre);
        // the second can be computed directly from the start tag
        Vertex preItem;
        Nature[] preTagSet;
        {
            Vertex item = iterator.next();
            cost[0] = new double[item.attribute.nature.length];
            int j = 0;
            int curIndex = 0;
            for (Nature cur : item.attribute.nature)
            {
                // transition cost + emission cost (frequency smoothed with 1e-8)
                cost[0][j] = transformMatrixDictionary.transititon_probability[pre.ordinal()][cur.ordinal()] - Math.log((item.attribute.frequency[curIndex] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur.ordinal()));
                ++j;
                ++curIndex;
            }
            preTagSet = item.attribute.nature;
            preItem = item;
        }
        // from the third on the general recurrence applies
        for (int i = 1; i < length; ++i)
        {
            int index_i = i & 1;
            int index_i_1 = 1 - index_i;
            Vertex item = iterator.next();
            cost[index_i] = new double[item.attribute.nature.length];
            double perfect_cost_line = Double.MAX_VALUE;
            int k = 0;
            Nature[] curTagSet = item.attribute.nature;
            for (Nature cur : curTagSet)
            {
                cost[index_i][k] = Double.MAX_VALUE;
                int j = 0;
                for (Nature p : preTagSet)
                {
                    double now = cost[index_i_1][j] + transformMatrixDictionary.transititon_probability[p.ordinal()][cur.ordinal()] - Math.log((item.attribute.frequency[k] + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur.ordinal()));
                    if (now < cost[index_i][k])
                    {
                        cost[index_i][k] = now;
                        if (now < perfect_cost_line)
                        {
                            perfect_cost_line = now;
                            pre = p;
                        }
                    }
                    ++j;
                }
                ++k;
            }
            // fix the previous vertex's tag to the best predecessor found at this step
            preItem.confirmNature(pre);
            preTagSet = curTagSet;
            preItem = item;
        }
    }

    /**
     * Standard Viterbi over enum role tags: higher precision, slightly slower.
     * NOTE(review): same greedy-predecessor simplification as above — the tag appended
     * per step is the globally cheapest predecessor, not a traced-back optimum.
     *
     * @param roleTagList               observation sequence
     * @param transformMatrixDictionary transition matrix
     * @param <E>                       concrete EnumItem type
     * @return predicted tag sequence
     */
    public static <E extends Enum<E>> List<E> computeEnum(List<EnumItem<E>> roleTagList, TransformMatrixDictionary<E> transformMatrixDictionary)
    {
        int length = roleTagList.size() - 1;
        List<E> tagList = new ArrayList<E>(roleTagList.size());
        double[][] cost = new double[2][]; // rolling array
        Iterator<EnumItem<E>> iterator = roleTagList.iterator();
        EnumItem<E> start = iterator.next();
        E pre = start.labelMap.entrySet().iterator().next().getKey();
        // the first one is fixed
        tagList.add(pre);
        // the second can be computed directly from the start tag
        Set<E> preTagSet;
        {
            EnumItem<E> item = iterator.next();
            cost[0] = new double[item.labelMap.size()];
            int j = 0;
            for (E cur : item.labelMap.keySet())
            {
                // transition cost + emission cost (frequency smoothed with 1e-8)
                cost[0][j] = transformMatrixDictionary.transititon_probability[pre.ordinal()][cur.ordinal()] - Math.log((item.getFrequency(cur) + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
                ++j;
            }
            preTagSet = item.labelMap.keySet();
        }
        // from the third on the general recurrence applies
        for (int i = 1; i < length; ++i)
        {
            int index_i = i & 1;
            int index_i_1 = 1 - index_i;
            EnumItem<E> item = iterator.next();
            cost[index_i] = new double[item.labelMap.size()];
            double perfect_cost_line = Double.MAX_VALUE;
            int k = 0;
            Set<E> curTagSet = item.labelMap.keySet();
            for (E cur : curTagSet)
            {
                cost[index_i][k] = Double.MAX_VALUE;
                int j = 0;
                for (E p : preTagSet)
                {
                    double now = cost[index_i_1][j] + transformMatrixDictionary.transititon_probability[p.ordinal()][cur.ordinal()] - Math.log((item.getFrequency(cur) + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
                    if (now < cost[index_i][k])
                    {
                        cost[index_i][k] = now;
                        if (now < perfect_cost_line)
                        {
                            perfect_cost_line = now;
                            pre = p;
                        }
                    }
                    ++j;
                }
                ++k;
            }
            tagList.add(pre);
            preTagSet = curTagSet;
        }
        tagList.add(tagList.get(0)); // for the trailing ##end## marker
        return tagList;
    }

    /**
     * A "Viterbi" that uses only the transition matrix: greedily picks the cheapest
     * next tag given the previous one (no dynamic programming).
     *
     * @param roleTagList               observation sequence
     * @param transformMatrixDictionary transition matrix
     * @param <E>                       concrete EnumItem type
     * @return predicted tag sequence
     */
    public static <E extends Enum<E>> List<E> computeEnumSimply(List<EnumItem<E>> roleTagList, TransformMatrixDictionary<E> transformMatrixDictionary)
    {
        int length = roleTagList.size() - 1;
        List<E> tagList = new LinkedList<E>();
        Iterator<EnumItem<E>> iterator = roleTagList.iterator();
        EnumItem<E> start = iterator.next();
        E pre = start.labelMap.entrySet().iterator().next().getKey();
        E perfect_tag = pre;
        // the first one is fixed
        tagList.add(pre);
        for (int i = 0; i < length; ++i)
        {
            double perfect_cost = Double.MAX_VALUE;
            EnumItem<E> item = iterator.next();
            for (E cur : item.labelMap.keySet())
            {
                double now = transformMatrixDictionary.transititon_probability[pre.ordinal()][cur.ordinal()] - Math.log((item.getFrequency(cur) + 1e-8) / transformMatrixDictionary.getTotalFrequency(cur));
                if (perfect_cost > now)
                {
                    perfect_cost = now;
                    perfect_tag = cur;
                }
            }
            pre = perfect_tag;
            tagList.add(pre);
        }
        return tagList;
    }
}

+ 100
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/Interval.java View File

@@ -0,0 +1,100 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

/**
 * A closed integer interval [start, end].
 */
public class Interval implements Intervalable
{
    /**
     * Start position (inclusive)
     */
    private int start;
    /**
     * End position (inclusive)
     */
    private int end;

    /**
     * Constructs an interval.
     * @param start inclusive start
     * @param end   inclusive end
     */
    public Interval(final int start, final int end)
    {
        this.start = start;
        this.end = end;
    }

    public int getStart()
    {
        return this.start;
    }

    public int getEnd()
    {
        return this.end;
    }

    /**
     * Number of positions covered (closed on both ends).
     */
    public int size()
    {
        return end - start + 1;
    }

    /**
     * Whether this interval overlaps another (shares at least one position).
     * @param other the other interval
     * @return true if they overlap
     */
    public boolean overlapsWith(Interval other)
    {
        return this.start <= other.getEnd() &&
                this.end >= other.getStart();
    }

    /**
     * Whether this interval covers the given point.
     * @param point the point
     * @return true if start &lt;= point &lt;= end
     */
    public boolean overlapsWith(int point)
    {
        return this.start <= point && point <= this.end;
    }

    @Override
    public boolean equals(Object o)
    {
        // NOTE(review): deliberately accepts any Intervalable with the same endpoints,
        // matching how the interval-tree code compares intervals across implementations
        if (!(o instanceof Intervalable))
        {
            return false;
        }
        Intervalable other = (Intervalable) o;
        return this.start == other.getStart() &&
                this.end == other.getEnd();
    }

    @Override
    public int hashCode()
    {
        // Weak but cheap hash; kept unchanged so existing hash-based distributions
        // are unaffected. Consistent with equals (equal endpoints -> equal hash).
        return this.start % 100 + this.end % 100;
    }

    @Override
    public int compareTo(Object o)
    {
        if (!(o instanceof Intervalable))
        {
            return -1;
        }
        Intervalable other = (Intervalable) o;
        // Integer.compare avoids the overflow of the previous subtraction-based
        // comparison for endpoints near Integer.MIN_VALUE/MAX_VALUE
        int byStart = Integer.compare(this.start, other.getStart());
        return byStart != 0 ? byStart : Integer.compare(this.end, other.getEnd());
    }

    @Override
    public String toString()
    {
        return this.start + ":" + this.end;
    }

}

+ 216
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalNode.java View File

@@ -0,0 +1,216 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A node of the interval tree: stores the intervals that contain its midpoint and
 * delegates the rest to a left child (intervals entirely below the midpoint) and a
 * right child (entirely above it).
 */
public class IntervalNode
{
    /**
     * Search direction relative to this node's midpoint
     */
    private enum Direction
    {
        LEFT, RIGHT
    }

    /**
     * Subtree of intervals entirely left of the midpoint
     */
    private IntervalNode left = null;
    /**
     * Subtree of intervals entirely right of the midpoint
     */
    private IntervalNode right = null;
    /**
     * Midpoint of the span covered by the intervals given to this node
     */
    private int point;
    /**
     * Intervals that contain the midpoint
     */
    private List<Intervalable> intervals = new ArrayList<Intervalable>();

    /**
     * Builds this node and, recursively, its subtrees.
     * @param intervals intervals to distribute over the subtree
     */
    public IntervalNode(List<Intervalable> intervals)
    {
        this.point = determineMedian(intervals);

        List<Intervalable> toLeft = new ArrayList<Intervalable>(); // intervals entirely left of the midpoint
        List<Intervalable> toRight = new ArrayList<Intervalable>(); // entirely right of it

        for (Intervalable interval : intervals)
        {
            if (interval.getEnd() < this.point)
            {
                toLeft.add(interval);
            }
            else if (interval.getStart() > this.point)
            {
                toRight.add(interval);
            }
            else
            {
                this.intervals.add(interval);
            }
        }

        if (toLeft.size() > 0)
        {
            this.left = new IntervalNode(toLeft);
        }
        if (toRight.size() > 0)
        {
            this.right = new IntervalNode(toRight);
        }
    }

    /**
     * Computes the split point: the arithmetic middle of the smallest start and the
     * largest end (named "median" historically, though it is a midpoint).
     * @param intervals interval collection
     * @return midpoint coordinate
     */
    public int determineMedian(List<Intervalable> intervals)
    {
        int start = -1;
        int end = -1;
        for (Intervalable interval : intervals)
        {
            int currentStart = interval.getStart();
            int currentEnd = interval.getEnd();
            if (start == -1 || currentStart < start)
            {
                start = currentStart;
            }
            if (end == -1 || currentEnd > end)
            {
                end = currentEnd;
            }
        }
        return (start + end) / 2;
    }

    /**
     * Finds intervals in this subtree that overlap the given interval.
     * The probe interval itself is never included (see addToOverlaps).
     * @param interval the probe interval
     * @return the overlapping intervals
     */
    public List<Intervalable> findOverlaps(Intervalable interval)
    {

        List<Intervalable> overlaps = new ArrayList<Intervalable>();

        if (this.point < interval.getStart())
        {
            // probe lies entirely right of the midpoint: search the right subtree,
            // plus any midpoint-straddling intervals that reach into the probe
            addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
            addToOverlaps(interval, overlaps, checkForOverlapsToTheRight(interval));
        }
        else if (this.point > interval.getEnd())
        {
            // probe lies entirely left of the midpoint: mirror of the case above
            addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
            addToOverlaps(interval, overlaps, checkForOverlapsToTheLeft(interval));
        }
        else
        {
            // probe contains the midpoint: every interval stored here overlaps it,
            // and both subtrees may contribute more
            addToOverlaps(interval, overlaps, this.intervals);
            addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval));
            addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval));
        }

        return overlaps;
    }

    /**
     * Adds candidate overlaps to the result list, excluding the probe interval itself.
     * @param interval    the probe interval
     * @param overlaps    result list
     * @param newOverlaps candidates to add
     */
    protected void addToOverlaps(Intervalable interval, List<Intervalable> overlaps, List<Intervalable> newOverlaps)
    {
        for (Intervalable currentInterval : newOverlaps)
        {
            if (!currentInterval.equals(interval))
            {
                overlaps.add(currentInterval);
            }
        }
    }

    /**
     * Checks this node's own intervals for overlaps extending to the left of the probe.
     * @param interval the probe interval
     * @return overlapping intervals
     */
    protected List<Intervalable> checkForOverlapsToTheLeft(Intervalable interval)
    {
        return checkForOverlaps(interval, Direction.LEFT);
    }

    /**
     * Checks this node's own intervals for overlaps extending to the right of the probe.
     * @param interval the probe interval
     * @return overlapping intervals
     */
    protected List<Intervalable> checkForOverlapsToTheRight(Intervalable interval)
    {
        return checkForOverlaps(interval, Direction.RIGHT);
    }

    /**
     * Scans this node's stored intervals for overlaps with the probe.
     * @param interval  the probe interval
     * @param direction which side of the probe the stored intervals lie on
     * @return overlapping intervals
     */
    protected List<Intervalable> checkForOverlaps(Intervalable interval, Direction direction)
    {

        List<Intervalable> overlaps = new ArrayList<Intervalable>();
        for (Intervalable currentInterval : this.intervals)
        {
            switch (direction)
            {
                case LEFT:
                    if (currentInterval.getStart() <= interval.getEnd())
                    {
                        overlaps.add(currentInterval);
                    }
                    break;
                case RIGHT:
                    if (currentInterval.getEnd() >= interval.getStart())
                    {
                        overlaps.add(currentInterval);
                    }
                    break;
            }
        }
        return overlaps;
    }

    /**
     * Null-safe wrapper around IntervalNode.findOverlaps(Intervalable), preventing NPE
     * when a subtree is absent.
     * @see com.hankcs.hanlp.algorithm.ahocorasick.interval.IntervalNode#findOverlaps(Intervalable)
     * @param node     subtree root, may be null
     * @param interval the probe interval
     * @return overlapping intervals, or an empty list when node is null
     */
    protected static List<Intervalable> findOverlappingRanges(IntervalNode node, Intervalable interval)
    {
        if (node != null)
        {
            return node.findOverlaps(interval);
        }
        return Collections.emptyList();
    }

}

+ 77
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalTree.java View File

@@ -0,0 +1,77 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

/**
 * Segment tree used to detect and resolve overlapping intervals.
 */
public class IntervalTree
{
    /**
     * Root node of the tree.
     */
    private IntervalNode rootNode = null;

    /**
     * Builds the tree over the given intervals.
     *
     * @param intervals intervals to index
     */
    public IntervalTree(List<Intervalable> intervals)
    {
        this.rootNode = new IntervalNode(intervals);
    }

    /**
     * Removes overlapping intervals from the list, preferring larger intervals
     * (ties broken by the leftmost start).
     *
     * @param intervals the list to filter in place
     * @return the same list, overlap-free and sorted by start position
     */
    public List<Intervalable> removeOverlaps(List<Intervalable> intervals)
    {
        // process larger intervals first, ties broken by start position
        Collections.sort(intervals, new IntervalableComparatorBySize());

        Set<Intervalable> discarded = new TreeSet<Intervalable>();
        for (Intervalable candidate : intervals)
        {
            // already scheduled for removal: nothing to do
            if (discarded.contains(candidate))
            {
                continue;
            }
            // keep this one; everything it overlaps gets discarded
            discarded.addAll(findOverlaps(candidate));
        }

        // drop the discarded intervals one by one
        for (Intervalable dead : discarded)
        {
            intervals.remove(dead);
        }

        // restore left-to-right order for the survivors
        Collections.sort(intervals, new IntervalableComparatorByPosition());

        return intervals;
    }

    /**
     * Finds all intervals overlapping the query interval.
     *
     * @param interval the query interval
     * @return the overlapping intervals
     */
    public List<Intervalable> findOverlaps(Intervalable interval)
    {
        return rootNode.findOverlaps(interval);
    }

}

+ 26
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/Intervalable.java View File

@@ -0,0 +1,26 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

/**
 * An interval with an inclusive start and end position.
 * NOTE(review): extends the raw {@code Comparable} type; implementations should
 * compare against other {@code Intervalable}s — consider {@code Comparable<Intervalable>}.
 */
public interface Intervalable extends Comparable
{
    /**
     * Start position (inclusive).
     * @return the start offset
     */
    public int getStart();

    /**
     * End position.
     * @return the end offset
     */
    public int getEnd();

    /**
     * Length of the interval.
     * @return the number of positions covered
     */
    public int size();

}

+ 15
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalableComparatorByPosition.java View File

@@ -0,0 +1,15 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

import java.util.Comparator;

/**
 * Orders intervals by their start position, ascending.
 */
public class IntervalableComparatorByPosition implements Comparator<Intervalable>
{
    /**
     * Compares two intervals by start position.
     *
     * @param intervalable  the first interval
     * @param intervalable2 the second interval
     * @return negative, zero, or positive per the {@link Comparator} contract
     */
    @Override
    public int compare(Intervalable intervalable, Intervalable intervalable2)
    {
        // Integer.compare avoids the int-overflow pitfall of subtraction-based comparators
        return Integer.compare(intervalable.getStart(), intervalable2.getStart());
    }
}

+ 20
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/interval/IntervalableComparatorBySize.java View File

@@ -0,0 +1,20 @@
package com.hankcs.hanlp.algorithm.ahocorasick.interval;

import java.util.Comparator;

/**
 * Orders intervals by size, largest first; ties are broken by start position,
 * leftmost first.
 */
public class IntervalableComparatorBySize implements Comparator<Intervalable>
{
    /**
     * Compares two intervals, larger sizes ordered before smaller ones.
     *
     * @param intervalable  the first interval
     * @param intervalable2 the second interval
     * @return negative, zero, or positive per the {@link Comparator} contract
     */
    @Override
    public int compare(Intervalable intervalable, Intervalable intervalable2)
    {
        // arguments reversed on purpose: bigger intervals sort first.
        // Integer.compare avoids the int-overflow pitfall of subtraction-based comparators.
        int comparison = Integer.compare(intervalable2.size(), intervalable.size());
        if (comparison == 0)
        {
            comparison = Integer.compare(intervalable.getStart(), intervalable2.getStart());
        }
        return comparison;
    }
}

+ 42
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Emit.java View File

@@ -0,0 +1,42 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

import com.hankcs.hanlp.algorithm.ahocorasick.interval.Interval;
import com.hankcs.hanlp.algorithm.ahocorasick.interval.Intervalable;

/**
 * A single pattern-match result: an interval in the text plus the keyword that
 * matched there.
 */
public class Emit extends Interval implements Intervalable
{
    // the pattern string that matched
    private final String keyword;

    /**
     * Constructs a match result.
     * @param start start offset of the match
     * @param end end offset of the match
     * @param keyword the matched pattern string
     */
    public Emit(final int start, final int end, final String keyword)
    {
        super(start, end);
        this.keyword = keyword;
    }

    /**
     * Returns the matched pattern string.
     * @return the keyword
     */
    public String getKeyword()
    {
        return this.keyword;
    }

    @Override
    public String toString()
    {
        return super.toString() + "=" + this.keyword;
    }
}

+ 22
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/FragmentToken.java View File

@@ -0,0 +1,22 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

/**
 * A token covering a fragment of text that did not match any keyword.
 */
public class FragmentToken extends Token
{

    public FragmentToken(String fragment)
    {
        super(fragment);
    }

    /** Never a match by definition. */
    @Override
    public boolean isMatch()
    {
        return false;
    }

    /** No emit is associated with an unmatched fragment. */
    @Override
    public Emit getEmit()
    {
        return null;
    }
}

+ 29
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/MatchToken.java View File

@@ -0,0 +1,29 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

/**
 * A token covering a fragment of text that matched a keyword.
 */
public class MatchToken extends Token
{

    // the match result this token corresponds to
    private Emit emit;

    public MatchToken(String fragment, Emit emit)
    {
        super(fragment);
        this.emit = emit;
    }

    /** Always a match by definition. */
    @Override
    public boolean isMatch()
    {
        return true;
    }

    /** Returns the match result backing this token. */
    @Override
    public Emit getEmit()
    {
        return this.emit;
    }

}

+ 190
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/State.java View File

@@ -0,0 +1,190 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

import java.util.*;

/**
* <p>
* 一个状态有如下几个功能
* </p>
* <p/>
* <ul>
* <li>success; 成功转移到另一个状态</li>
* <li>failure; 不可顺着字符串跳转的话,则跳转到一个浅一点的节点</li>
* <li>emits; 命中一个模式串</li>
* </ul>
* <p/>
* <p>
* 根节点稍有不同,根节点没有 failure 功能,它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。
* </p>
*
* @author Robert Bor
*/
public class State
{

/**
* 模式串的长度,也是这个状态的深度
*/
protected final int depth;

/**
* fail 函数,如果没有匹配到,则跳转到此状态。
*/
private State failure = null;

/**
* 只要这个状态可达,则记录模式串
*/
private Set<String> emits = null;
/**
* goto 表,也称转移函数。根据字符串的下一个字符转移到下一个状态
*/
private Map<Character, State> success = new TreeMap<Character, State>();

/**
* 构造深度为0的节点
*/
public State()
{
this(0);
}

/**
* 构造深度为depth的节点
* @param depth
*/
public State(int depth)
{
this.depth = depth;
}

/**
* 获取节点深度
* @return
*/
public int getDepth()
{
return this.depth;
}

/**
* 添加一个匹配到的模式串(这个状态对应着这个模式串)
* @param keyword
*/
public void addEmit(String keyword)
{
if (this.emits == null)
{
this.emits = new TreeSet<String>();
}
this.emits.add(keyword);
}

/**
* 添加一些匹配到的模式串
* @param emits
*/
public void addEmit(Collection<String> emits)
{
for (String emit : emits)
{
addEmit(emit);
}
}

/**
* 获取这个节点代表的模式串(们)
* @return
*/
public Collection<String> emit()
{
return this.emits == null ? Collections.<String>emptyList() : this.emits;
}

/**
* 获取failure状态
* @return
*/
public State failure()
{
return this.failure;
}

/**
* 设置failure状态
* @param failState
*/
public void setFailure(State failState)
{
this.failure = failState;
}

/**
* 转移到下一个状态
* @param character 希望按此字符转移
* @param ignoreRootState 是否忽略根节点,如果是根节点自己调用则应该是true,否则为false
* @return 转移结果
*/
private State nextState(Character character, boolean ignoreRootState)
{
State nextState = this.success.get(character);
if (!ignoreRootState && nextState == null && this.depth == 0)
{
nextState = this;
}
return nextState;
}

/**
* 按照character转移,根节点转移失败会返回自己(永远不会返回null)
* @param character
* @return
*/
public State nextState(Character character)
{
return nextState(character, false);
}

/**
* 按照character转移,任何节点转移失败会返回null
* @param character
* @return
*/
public State nextStateIgnoreRootState(Character character)
{
return nextState(character, true);
}

public State addState(Character character)
{
State nextState = nextStateIgnoreRootState(character);
if (nextState == null)
{
nextState = new State(this.depth + 1);
this.success.put(character, nextState);
}
return nextState;
}

public Collection<State> getStates()
{
return this.success.values();
}

public Collection<Character> getTransitions()
{
return this.success.keySet();
}

@Override
public String toString()
{
final StringBuilder sb = new StringBuilder("State{");
sb.append("depth=").append(depth);
sb.append(", emits=").append(emits);
sb.append(", success=").append(success.keySet());
sb.append(", failure=").append(failure);
sb.append('}');
return sb.toString();
}
}

+ 32
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Token.java View File

@@ -0,0 +1,32 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

/**
 * A fragment of tokenized text, either matched against a keyword or not.
 */
public abstract class Token
{
    // the covered piece of the original text
    private String fragment;

    public Token(String fragment)
    {
        this.fragment = fragment;
    }

    /**
     * Returns the covered text fragment.
     */
    public String getFragment()
    {
        return this.fragment;
    }

    /**
     * Whether this fragment matched a keyword.
     */
    public abstract boolean isMatch();

    /**
     * The match result, or null when this fragment is not a match.
     */
    public abstract Emit getEmit();

    @Override
    public String toString()
    {
        return fragment + "/" + isMatch();
    }
}

+ 335
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/Trie.java View File

@@ -0,0 +1,335 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

import com.hankcs.hanlp.algorithm.ahocorasick.interval.IntervalTree;
import com.hankcs.hanlp.algorithm.ahocorasick.interval.Intervalable;

import java.util.*;
import java.util.concurrent.LinkedBlockingDeque;

/**
 * Aho-Corasick automaton, based on the Aho-Corasick white paper, Bell Labs:
 * ftp://163.13.200.222/assistant/bearhero/prog/%A8%E4%A5%A6/ac_bm.pdf
 *
 * @author Robert Bor
 */
public class Trie
{

    // configuration: overlap / longest-match behavior
    private TrieConfig trieConfig;

    // root state of the automaton
    private State rootState;

    /**
     * Whether the failure table has been constructed yet.
     */
    private boolean failureStatesConstructed = false;

    /**
     * Builds a trie with the given configuration.
     */
    public Trie(TrieConfig trieConfig)
    {
        this.trieConfig = trieConfig;
        this.rootState = new State();
    }

    /** Builds a trie with the default configuration. */
    public Trie()
    {
        this(new TrieConfig());
    }

    /** Builds a trie preloaded with the given keywords. */
    public Trie(Collection<String> keywords)
    {
        this();
        addAllKeyword(keywords);
    }

    /** Configures this trie to remove overlapping matches. Returns this for chaining. */
    public Trie removeOverlaps()
    {
        this.trieConfig.setAllowOverlaps(false);
        return this;
    }

    /**
     * Keep only the longest matches.
     * @return this, for chaining
     */
    public Trie remainLongest()
    {
        this.trieConfig.remainLongest = true;
        return this;
    }

    /** Adds a single keyword; null or empty keywords are ignored. */
    public void addKeyword(String keyword)
    {
        if (keyword == null || keyword.length() == 0)
        {
            return;
        }
        State currentState = this.rootState;
        for (Character character : keyword.toCharArray())
        {
            currentState = currentState.addState(character);
        }
        // the final state of the path recognizes the keyword
        currentState.addEmit(keyword);
    }

    /** Adds every keyword in the collection. */
    public void addAllKeyword(Collection<String> keywordSet)
    {
        for (String keyword : keywordSet)
        {
            addKeyword(keyword);
        }
    }

    /**
     * A longest-match tokenizer: splits the text into matched and unmatched tokens.
     *
     * @param text the text to tokenize
     * @return matched and fragment tokens covering the whole text
     */
    public Collection<Token> tokenize(String text)
    {

        Collection<Token> tokens = new ArrayList<Token>();

        Collection<Emit> collectedEmits = parseText(text);
        // the key to longest segmentation: drop overlapping (shorter) matches
        IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
        intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
        // overlap removal done

        int lastCollectedPosition = -1;
        for (Emit emit : collectedEmits)
        {
            // emit gaps between matches as fragment tokens
            if (emit.getStart() - lastCollectedPosition > 1)
            {
                tokens.add(createFragment(emit, text, lastCollectedPosition));
            }
            tokens.add(createMatch(emit, text));
            lastCollectedPosition = emit.getEnd();
        }
        // trailing unmatched text, if any
        if (text.length() - lastCollectedPosition > 1)
        {
            tokens.add(createFragment(null, text, lastCollectedPosition));
        }

        return tokens;
    }

    // builds the fragment token between lastCollectedPosition and the next match (or end of text)
    private Token createFragment(Emit emit, String text, int lastCollectedPosition)
    {
        return new FragmentToken(text.substring(lastCollectedPosition + 1, emit == null ? text.length() : emit.getStart()));
    }

    // builds the token for a matched emit; emit end offsets are inclusive, hence +1
    private Token createMatch(Emit emit, String text)
    {
        return new MatchToken(text.substring(emit.getStart(), emit.getEnd() + 1), emit);
    }

    /**
     * Pattern matching: finds every keyword occurrence in the text.
     *
     * @param text the text to scan
     * @return the matched patterns
     */
    @SuppressWarnings("unchecked")
    public Collection<Emit> parseText(String text)
    {
        checkForConstructedFailureStates();

        // position tracks the index of the current character (== i throughout the loop)
        int position = 0;
        State currentState = this.rootState;
        List<Emit> collectedEmits = new ArrayList<Emit>();
        for (int i = 0; i < text.length(); ++i)
        {
            currentState = getState(currentState, text.charAt(i));
            storeEmits(position, currentState, collectedEmits);
            ++position;
        }

        if (!trieConfig.isAllowOverlaps())
        {
            IntervalTree intervalTree = new IntervalTree((List<Intervalable>) (List<?>) collectedEmits);
            intervalTree.removeOverlaps((List<Intervalable>) (List<?>) collectedEmits);
        }

        if (trieConfig.remainLongest)
        {
            remainLongest(collectedEmits);
        }

        return collectedEmits;
    }

    /**
     * Keep only the longest match at each start and each end position.
     * @param collectedEmits matches to filter in place
     */
    private static void remainLongest(Collection<Emit> collectedEmits)
    {
        if (collectedEmits.size() < 2) return;
        // pass 1: per start offset, keep only the longest emit
        Map<Integer, Emit> emitMapStart = new TreeMap<Integer, Emit>();
        for (Emit emit : collectedEmits)
        {
            Emit pre = emitMapStart.get(emit.getStart());
            if (pre == null || pre.size() < emit.size())
            {
                emitMapStart.put(emit.getStart(), emit);
            }
        }
        if (emitMapStart.size() < 2)
        {
            collectedEmits.clear();
            collectedEmits.addAll(emitMapStart.values());
            return;
        }
        // pass 2: per end offset, keep only the longest emit
        Map<Integer, Emit> emitMapEnd = new TreeMap<Integer, Emit>();
        for (Emit emit : emitMapStart.values())
        {
            Emit pre = emitMapEnd.get(emit.getEnd());
            if (pre == null || pre.size() < emit.size())
            {
                emitMapEnd.put(emit.getEnd(), emit);
            }
        }

        collectedEmits.clear();
        collectedEmits.addAll(emitMapEnd.values());
    }


    /**
     * Transitions to the next state.
     *
     * @param currentState the current state
     * @param character    the input character
     * @return the resulting state (never null: the root falls back to itself)
     */
    private static State getState(State currentState, Character character)
    {
        State newCurrentState = currentState.nextState(character); // try the goto (success) transition first
        while (newCurrentState == null) // on failure, follow failure links until a transition succeeds
        {
            currentState = currentState.failure();
            newCurrentState = currentState.nextState(character);
        }
        return newCurrentState;
    }

    /**
     * Lazily constructs the failure table on first use.
     */
    private void checkForConstructedFailureStates()
    {
        if (!this.failureStatesConstructed)
        {
            constructFailureStates();
        }
    }

    /**
     * Builds the failure table (classic BFS construction).
     * NOTE(review): uses a LinkedBlockingDeque although construction is
     * single-threaded here; a plain ArrayDeque would suffice — confirm no
     * concurrent callers before changing.
     */
    private void constructFailureStates()
    {
        Queue<State> queue = new LinkedBlockingDeque<State>();

        // step 1: depth-1 states fail back to the root
        for (State depthOneState : this.rootState.getStates())
        {
            depthOneState.setFailure(this.rootState);
            queue.add(depthOneState);
        }
        this.failureStatesConstructed = true;

        // step 2: BFS to set failure links for states of depth > 1
        while (!queue.isEmpty())
        {
            State currentState = queue.remove();

            for (Character transition : currentState.getTransitions())
            {
                State targetState = currentState.nextState(transition);
                queue.add(targetState);

                // walk failure links until one has a goto on this character
                State traceFailureState = currentState.failure();
                while (traceFailureState.nextState(transition) == null)
                {
                    traceFailureState = traceFailureState.failure();
                }
                State newFailureState = traceFailureState.nextState(transition);
                targetState.setFailure(newFailureState);
                // inherit the failure state's emits (suffix matches)
                targetState.addEmit(newFailureState.emit());
            }
        }
    }

    /** Depth-first traversal over all states, invoking the walker at each one. */
    public void dfs(IWalker walker)
    {
        checkForConstructedFailureStates();
        dfs(rootState, "", walker);
    }

    // recursive DFS helper; path is the string spelled from the root to currentState
    private void dfs(State currentState, String path, IWalker walker)
    {
        walker.meet(path, currentState);
        for (Character transition : currentState.getTransitions())
        {
            State targetState = currentState.nextState(transition);
            dfs(targetState, path + transition, walker);
        }
    }


    /** Callback interface for {@link #dfs(IWalker)}. */
    public static interface IWalker
    {
        /**
         * Called once per visited state.
         * @param path  the string spelled from the root to this state
         * @param state the visited state
         */
        void meet(String path, State state);
    }

    /**
     * Records the matches recognized at the current state.
     *
     * @param position       index of the current character (the inclusive end of any match)
     * @param currentState   the current state
     * @param collectedEmits output list for matches
     */
    private static void storeEmits(int position, State currentState, List<Emit> collectedEmits)
    {
        Collection<String> emits = currentState.emit();
        if (emits != null && !emits.isEmpty())
        {
            for (String emit : emits)
            {
                collectedEmits.add(new Emit(position - emit.length() + 1, position, emit));
            }
        }
    }

    /**
     * Whether the text contains any of the keywords.
     * NOTE(review): getState never returns null (the root falls back to itself),
     * so the null check below is defensive; likewise the nextState != currentState
     * guard only skips a state whose emits were already seen on first entry.
     *
     * @param text the text to scan
     * @return true if any keyword occurs in the text
     */
    public boolean hasKeyword(String text)
    {
        checkForConstructedFailureStates();

        State currentState = this.rootState;
        for (int i = 0; i < text.length(); ++i)
        {
            State nextState = getState(currentState, text.charAt(i));
            if (nextState != null && nextState != currentState && nextState.emit().size() != 0) {
                return true;
            }
            currentState = nextState;
        }
        return false;
    }

}

+ 37
- 0
core/src/main/java/com/hankcs/hanlp/algorithm/ahocorasick/trie/TrieConfig.java View File

@@ -0,0 +1,37 @@
package com.hankcs.hanlp.algorithm.ahocorasick.trie;

/**
 * Configuration flags for the Aho-Corasick trie.
 */
public class TrieConfig
{
    /**
     * Whether overlapping matches are permitted (defaults to true).
     */
    private boolean allowOverlaps = true;

    /**
     * Whether only the longest matches should be kept (defaults to false).
     */
    public boolean remainLongest = false;

    /**
     * Tells whether overlapping matches are permitted.
     *
     * @return true when overlaps are allowed
     */
    public boolean isAllowOverlaps()
    {
        return this.allowOverlaps;
    }

    /**
     * Enables or disables overlapping matches.
     *
     * @param allowOverlaps true to allow overlaps
     */
    public void setAllowOverlaps(boolean allowOverlaps)
    {
        this.allowOverlaps = allowOverlaps;
    }
}

+ 152
- 0
core/src/main/java/com/hankcs/hanlp/classification/classifiers/AbstractClassifier.java View File

@@ -0,0 +1,152 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>2016/1/29 18:00</create-date>
*
* <copyright file="AbstractClassifier.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.classifiers;

import com.hankcs.hanlp.classification.corpus.Document;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.corpus.MemoryDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.utilities.CollectionUtility;
import com.hankcs.hanlp.utility.MathUtility;

import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

import static com.hankcs.hanlp.classification.utilities.io.ConsoleLogger.logger;

/**
 * Base class for text classifiers: implements the convenience entry points
 * (classify/train overloads) on top of the abstract predict/categorize core.
 *
 * @author hankcs
 */
public abstract class AbstractClassifier implements IClassifier
{
    @Override
    public IClassifier enableProbability(boolean enable)
    {
        configProbabilityEnabled = enable;
        return this;
    }

    /**
     * Whether scores should be normalized into probabilities.
     */
    boolean configProbabilityEnabled = true;

    /**
     * Predicts the category of a text using the trained classifier.
     *
     * @param text the text to classify
     * @return the best-scoring category
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    @Override
    public String classify(String text) throws IllegalArgumentException, IllegalStateException
    {
        Map<String, Double> scoreMap = predict(text);

        return CollectionUtility.max(scoreMap);
    }

    @Override
    public String classify(Document document) throws IllegalArgumentException, IllegalStateException
    {
        Map<String, Double> scoreMap = predict(document);

        return CollectionUtility.max(scoreMap);
    }

    @Override
    public void train(String folderPath, String charsetName) throws IOException
    {
        // load the corpus folder into an in-memory data set, then train on it
        IDataSet dataSet = new MemoryDataSet();
        dataSet.load(folderPath, charsetName);
        train(dataSet);
    }

    @Override
    public void train(Map<String, String[]> trainingDataSet) throws IllegalArgumentException
    {
        IDataSet dataSet = new MemoryDataSet();
        logger.start("正在构造训练数据集...");
        int total = trainingDataSet.size();
        int cur = 0;
        for (Map.Entry<String, String[]> entry : trainingDataSet.entrySet())
        {
            String category = entry.getKey();
            logger.out("[%s]...", category);
            for (String doc : entry.getValue())
            {
                dataSet.add(category, doc);
            }
            ++cur;
            logger.out("%.2f%%...", MathUtility.percentage(cur, total));
        }
        logger.finish(" 加载完毕\n");
        train(dataSet);
    }

    @Override
    public void train(String folderPath) throws IOException
    {
        train(folderPath, "UTF-8");
    }

    @Override
    public Map<String, Double> predict(Document document)
    {
        AbstractModel model = getModel();
        if (model == null)
        {
            throw new IllegalStateException("未训练模型!无法执行预测!");
        }
        if (document == null)
        {
            throw new IllegalArgumentException("参数 text == null");
        }

        // map each category index to its score via the model's catalog
        double[] probs = categorize(document);
        Map<String, Double> scoreMap = new TreeMap<String, Double>();
        for (int i = 0; i < probs.length; i++)
        {
            scoreMap.put(model.catalog[i], probs[i]);
        }
        return scoreMap;
    }

    @Override
    public int label(Document document) throws IllegalArgumentException, IllegalStateException
    {
        AbstractModel model = getModel();
        if (model == null)
        {
            throw new IllegalStateException("未训练模型!无法执行预测!");
        }
        if (document == null)
        {
            throw new IllegalArgumentException("参数 text == null");
        }

        // argmax over the category scores
        double[] probs = categorize(document);
        double max = Double.NEGATIVE_INFINITY;
        int best = -1;
        for (int i = 0; i < probs.length; i++)
        {
            if (probs[i] > max)
            {
                max = probs[i];
                best = i;
            }
        }
        return best;
    }
}

+ 145
- 0
core/src/main/java/com/hankcs/hanlp/classification/classifiers/IClassifier.java View File

@@ -0,0 +1,145 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>2016/1/29 17:59</create-date>
*
* <copyright file="ITextClassifier.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.classifiers;

import com.hankcs.hanlp.classification.corpus.Document;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.models.AbstractModel;

import java.io.IOException;
import java.util.Map;

/**
 * Text classifier interface.
 *
 * @author hankcs
 */
public interface IClassifier
{
    /**
     * Whether to normalize scores into probabilities.
     *
     * @param enable true to normalize
     * @return this, for chaining
     */
    IClassifier enableProbability(boolean enable);

    /**
     * Predicts category scores for a text.
     *
     * @param text the text to classify
     * @return scores per category (probabilities when enableProbability is on)
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    Map<String, Double> predict(String text) throws IllegalArgumentException, IllegalStateException;

    /**
     * Predicts category scores for a structured document.
     * @param document the document to classify
     * @return scores per category
     */
    Map<String, Double> predict(Document document) throws IllegalArgumentException, IllegalStateException;

    /**
     * Computes the raw per-category score array for a document.
     * @param document the document to classify
     * @return one score per category, indexed by category id
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    double[] categorize(Document document) throws IllegalArgumentException, IllegalStateException;

    /**
     * Predicts the most likely category id.
     * @param document the document to classify
     * @return the best category id
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    int label(Document document) throws IllegalArgumentException, IllegalStateException;

    /**
     * Predicts the most likely category.
     * @param text the text to classify
     * @return the best category
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    String classify(String text) throws IllegalArgumentException, IllegalStateException;

    /**
     * Predicts the most likely category.
     * @param document a structured document (note: this is a low-level data structure, handle with care)
     * @return the best category
     * @throws IllegalArgumentException on invalid input
     * @throws IllegalStateException when no model has been trained
     */
    String classify(Document document) throws IllegalArgumentException, IllegalStateException;

    /**
     * Trains the model.
     *
     * @param trainingDataSet training data as a Map: key is the category name, value is an array
     *                        where each element is one document's content.
     */
    void train(Map<String, String[]> trainingDataSet) throws IllegalArgumentException;

    /**
     * Trains the model.
     *
     * @param folderPath root folder of the categorized corpus. It must look like:<br>
     *                   root<br>
     *                   ├── categoryA<br>
     *                   │ └── 1.txt<br>
     *                   │ └── 2.txt<br>
     *                   │ └── 3.txt<br>
     *                   ├── categoryB<br>
     *                   │ └── 1.txt<br>
     *                   │ └── ...<br>
     *                   └── ...<br>
     *                   Files need not be numbered nor end in .txt, but they must be text files.
     * @param charsetName file encoding
     * @throws IOException any possible IO exception
     */
    void train(String folderPath, String charsetName) throws IOException;

    /**
     * Trains the model from a UTF-8 encoded corpus.
     *
     * @param folderPath root folder of the UTF-8 categorized corpus. It must look like:<br>
     *                   root<br>
     *                   ├── categoryA<br>
     *                   │ └── 1.txt<br>
     *                   │ └── 2.txt<br>
     *                   │ └── 3.txt<br>
     *                   ├── categoryB<br>
     *                   │ └── 1.txt<br>
     *                   │ └── ...<br>
     *                   └── ...<br>
     *                   Files need not be numbered nor end in .txt, but they must be text files.
     * @throws IOException any possible IO exception
     */
    void train(String folderPath) throws IOException;

    /**
     * Trains the model.
     * @param dataSet the training data set
     * @throws IllegalArgumentException when the data set is empty
     */
    void train(IDataSet dataSet) throws IllegalArgumentException;

    /**
     * Returns the trained model, usable for serialization or prediction.
     * @return the model, or null when untrained
     */
    AbstractModel getModel();
}

+ 205
- 0
core/src/main/java/com/hankcs/hanlp/classification/classifiers/NaiveBayesClassifier.java View File

@@ -0,0 +1,205 @@
package com.hankcs.hanlp.classification.classifiers;

import com.hankcs.hanlp.utility.MathUtility;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.classification.corpus.*;
import com.hankcs.hanlp.classification.features.ChiSquareFeatureExtractor;
import com.hankcs.hanlp.classification.features.BaseFeatureData;
import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.models.NaiveBayesModel;

import static com.hankcs.hanlp.classification.utilities.io.ConsoleLogger.logger;

import java.util.*;

/**
 * A text classifier based on the multinomial naive Bayes model.
 */
public class NaiveBayesClassifier extends AbstractClassifier
{

    private NaiveBayesModel model;

    /**
     * Builds a classifier from an existing training result, typically one
     * loaded from disk.
     *
     * @param naiveBayesModel the trained model (may be null)
     */
    public NaiveBayesClassifier(NaiveBayesModel naiveBayesModel)
    {
        this.model = naiveBayesModel;
    }

    /**
     * Builds an empty classifier intended to be trained.
     */
    public NaiveBayesClassifier()
    {
        this(null);
    }

    /**
     * Returns the training result.
     *
     * @return the trained model, or null when untrained
     */
    public NaiveBayesModel getNaiveBayesModel()
    {
        return model;
    }

    /**
     * Trains the model: selects features by chi-square, then estimates
     * Laplace-smoothed log priors and log likelihoods.
     */
    public void train(IDataSet dataSet)
    {
        logger.out("原始数据集大小:%d\n", dataSet.size());
        // select the best features
        BaseFeatureData featureData = selectFeatures(dataSet);

        // initialize the data used by the classifier
        model = new NaiveBayesModel();
        model.n = featureData.n; // number of samples
        model.d = featureData.featureCategoryJointCount.length; // number of features

        model.c = featureData.categoryCounts.length; // number of categories
        model.logPriors = new TreeMap<Integer, Double>();

        int sumCategory;
        for (int category = 0; category < featureData.categoryCounts.length; category++)
        {
            sumCategory = featureData.categoryCounts[category];
            model.logPriors.put(category, Math.log((double) sumCategory / model.n));
        }

        // Laplace (add-one) smoothing: count feature occurrences per category
        Map<Integer, Double> featureOccurrencesInCategory = new TreeMap<Integer, Double>();

        Double featureOccSum;
        for (Integer category : model.logPriors.keySet())
        {
            featureOccSum = 0.0;
            for (int feature = 0; feature < featureData.featureCategoryJointCount.length; feature++)
            {

                featureOccSum += featureData.featureCategoryJointCount[feature][category];
            }
            featureOccurrencesInCategory.put(category, featureOccSum);
        }

        // log-likelihood estimation
        int count;
        int[] featureCategoryCounts;
        double logLikelihood;
        for (Integer category : model.logPriors.keySet())
        {
            for (int feature = 0; feature < featureData.featureCategoryJointCount.length; feature++)
            {

                featureCategoryCounts = featureData.featureCategoryJointCount[feature];

                count = featureCategoryCounts[category];

                // add-one smoothed log likelihood
                logLikelihood = Math.log((count + 1.0) / (featureOccurrencesInCategory.get(category) + model.d));
                if (!model.logLikelihoods.containsKey(feature))
                {
                    model.logLikelihoods.put(feature, new TreeMap<Integer, Double>());
                }
                model.logLikelihoods.get(feature).put(category, logLikelihood);
            }
        }
        logger.out("贝叶斯统计结束\n");
        model.catalog = dataSet.getCatalog().toArray();
        model.tokenizer = dataSet.getTokenizer();
        model.wordIdTrie = featureData.wordIdTrie;
    }

    public AbstractModel getModel()
    {
        return model;
    }

    /**
     * Predicts category scores for a text: segments it into a document and
     * delegates to {@link #predict(Document)}.
     */
    public Map<String, Double> predict(String text) throws IllegalArgumentException, IllegalStateException
    {
        if (model == null)
        {
            throw new IllegalStateException("未训练模型!无法执行预测!");
        }
        if (text == null)
        {
            throw new IllegalArgumentException("参数 text == null");
        }

        // segment the text and build a document
        Document doc = new Document(model.wordIdTrie, model.tokenizer.segment(text));

        return predict(doc);
    }

    @Override
    public double[] categorize(Document document) throws IllegalArgumentException, IllegalStateException
    {
        Integer category;
        Integer feature;
        Integer occurrences;
        Double logprob;

        double[] predictionScores = new double[model.catalog.length];
        for (Map.Entry<Integer, Double> entry1 : model.logPriors.entrySet())
        {
            category = entry1.getKey();
            logprob = entry1.getValue(); // initialize the score with the category's log prior

            // for every feature in the document
            for (Map.Entry<Integer, int[]> entry2 : document.tfMap.entrySet())
            {
                feature = entry2.getKey();

                if (!model.logLikelihoods.containsKey(feature))
                {
                    continue; // skip features unknown to the model
                }

                occurrences = entry2.getValue()[0]; // term frequency in the document

                logprob += occurrences * model.logLikelihoods.get(feature).get(category); // weight the log likelihood by frequency
            }
            predictionScores[category] = logprob;
        }

        if (configProbabilityEnabled) MathUtility.normalizeExp(predictionScores);
        return predictionScores;
    }

    /**
     * Collects feature statistics and performs feature selection; the returned
     * FeatureStats object is used to compute the model's probabilities.
     *
     * @param dataSet the training data
     * @return selected feature statistics with a rebuilt word-id trie
     */
    protected BaseFeatureData selectFeatures(IDataSet dataSet)
    {
        ChiSquareFeatureExtractor chiSquareFeatureExtractor = new ChiSquareFeatureExtractor();

        logger.start("使用卡方检测选择特征中...");
        // the FeatureStats object holds all features and their statistics
        BaseFeatureData featureData = chiSquareFeatureExtractor.extractBasicFeatureData(dataSet); // gather statistics

        // feed the statistics into the selection algorithm to score features
        Map<Integer, Double> selectedFeatures = chiSquareFeatureExtractor.chi_square(featureData);

        // drop unselected features and rebuild the feature-id mapping
        int[][] featureCategoryJointCount = new int[selectedFeatures.size()][];
        featureData.wordIdTrie = new BinTrie<Integer>();
        String[] wordIdArray = dataSet.getLexicon().getWordIdArray();
        int p = -1;
        for (Integer feature : selectedFeatures.keySet())
        {
            featureCategoryJointCount[++p] = featureData.featureCategoryJointCount[feature];
            featureData.wordIdTrie.put(wordIdArray[feature], p);
        }
        logger.finish(",选中特征数:%d / %d = %.2f%%\n", featureCategoryJointCount.length,
                      featureData.featureCategoryJointCount.length,
                      featureCategoryJointCount.length / (double)featureData.featureCategoryJointCount.length * 100.);
        featureData.featureCategoryJointCount = featureCategoryJointCount;

        return featureData;
    }
}

+ 39
- 0
core/src/main/java/com/hankcs/hanlp/classification/collections/FrequencyMap.java View File

@@ -0,0 +1,39 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/13 PM3:48</create-date>
*
* <copyright file="FrequencyMap.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.collections;

import java.util.TreeMap;

/**
 * A map that counts word frequencies; each value is a single-element int array
 * acting as a mutable counter.
 * @author hankcs
 */
public class FrequencyMap<K> extends TreeMap<K, int[]>
{
    /**
     * Increments the frequency of the given key by one.
     * @param key the key to count
     * @return the frequency after the increment
     */
    public int add(K key)
    {
        int[] counter = get(key);
        if (counter == null)
        {
            // first sighting: install a zeroed counter, bumped below
            counter = new int[]{0};
            put(key, counter);
        }
        return ++counter[0];
    }
}

+ 170
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/AbstractDataSet.java View File

@@ -0,0 +1,170 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM5:43</create-date>
*
* <copyright file="AbstractDataSet.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.classification.models.AbstractModel;
import com.hankcs.hanlp.classification.tokenizers.HanLPTokenizer;
import com.hankcs.hanlp.classification.tokenizers.ITokenizer;
import com.hankcs.hanlp.utility.MathUtility;
import com.hankcs.hanlp.classification.utilities.TextProcessUtility;

import java.io.File;
import java.io.IOException;
import java.util.Map;

import static com.hankcs.hanlp.classification.utilities.io.ConsoleLogger.logger;

/**
 * Base class for data sets: holds the tokenizer, catalog and lexicon, and
 * implements corpus loading from a folder layout of one sub-folder per category.
 *
 * @author hankcs
 */
public abstract class AbstractDataSet implements IDataSet
{
    protected ITokenizer tokenizer;
    protected Catalog catalog;
    protected Lexicon lexicon;
    /**
     * Whether this is a testing data set (shares the model's vocabulary/catalog).
     */
    protected boolean testingDataSet;

    /**
     * Constructs a testing data set bound to a trained model's vocabulary,
     * tokenizer and catalog.
     * @param model the model under test
     */
    public AbstractDataSet(AbstractModel model)
    {
        lexicon = new Lexicon(model.wordIdTrie);
        tokenizer = model.tokenizer;
        catalog = new Catalog(model.catalog);
        testingDataSet = true;
    }

    /**
     * Constructs an empty training data set with fresh catalog and lexicon.
     */
    public AbstractDataSet()
    {
        tokenizer = new HanLPTokenizer();
        catalog = new Catalog();
        lexicon = new Lexicon();
    }

    public IDataSet setTokenizer(ITokenizer tokenizer)
    {
        this.tokenizer = tokenizer;
        return this;
    }

    /**
     * Segments a raw text into a Document under the given category.
     * Testing data sets use the frozen model vocabulary; training data sets
     * grow the catalog and lexicon as needed.
     */
    public Document convert(String category, String text)
    {
        String[] tokenArray = tokenizer.segment(text);
        return testingDataSet ?
                new Document(catalog.categoryId, lexicon.wordId, category, tokenArray) :
                new Document(catalog, lexicon, category, tokenArray);
    }

    public ITokenizer getTokenizer()
    {
        return tokenizer;
    }

    public Catalog getCatalog()
    {
        return catalog;
    }

    public Lexicon getLexicon()
    {
        return lexicon;
    }

    @Override
    public IDataSet load(String folderPath, String charsetName) throws IllegalArgumentException, IOException
    {
        return load(folderPath, charsetName, 1.);
    }

    @Override
    public IDataSet load(String folderPath) throws IllegalArgumentException, IOException
    {
        return load(folderPath, "UTF-8");
    }

    @Override
    public boolean isTestingDataSet()
    {
        return testingDataSet;
    }

    /**
     * Loads a categorized corpus folder.
     *
     * @param folderPath  root folder with one sub-folder per category
     * @param charsetName file encoding
     * @param percentage  fraction of each category's files to load; a positive
     *                    value takes the head of the file list, a negative value
     *                    takes the tail (e.g. -0.1 loads the last 10%)
     * @return this data set, or null when the root folder has no entries
     * @throws IllegalArgumentException on a bad path or a percentage outside [-1, 1]
     * @throws IOException any possible IO exception
     */
    @Override
    public IDataSet load(String folderPath, String charsetName, double percentage) throws IllegalArgumentException, IOException
    {
        if (folderPath == null) throw new IllegalArgumentException("参数 folderPath == null");
        File root = new File(folderPath);
        if (!root.exists()) throw new IllegalArgumentException(String.format("目录 %s 不存在", root.getAbsolutePath()));
        if (!root.isDirectory())
            throw new IllegalArgumentException(String.format("目录 %s 不是一个目录", root.getAbsolutePath()));
        if (percentage > 1.0 || percentage < -1.0) throw new IllegalArgumentException("percentage 的绝对值必须介于[0, 1]之间");

        File[] folders = root.listFiles();
        if (folders == null) return null;
        logger.start("模式:%s\n文本编码:%s\n根目录:%s\n加载中...\n", testingDataSet ? "测试集" : "训练集", charsetName, folderPath);
        for (File folder : folders)
        {
            if (folder.isFile()) continue;
            File[] files = folder.listFiles();
            if (files == null) continue;
            String category = folder.getName();
            logger.out("[%s]...", category);
            // [b, e) is the slice of files to load: head for positive
            // percentage, tail for negative
            int b, e;
            if (percentage > 0)
            {
                b = 0;
                e = (int) (files.length * percentage);
            }
            else
            {
                b = (int) (files.length * (1 + percentage));
                e = files.length;
            }

            // log roughly every 1/10000th of the slice (at least every file)
            int logEvery = (int) Math.ceil((e - b) / 10000f);
            for (int i = b; i < e; i++)
            {
                add(folder.getName(), TextProcessUtility.readTxt(files[i], charsetName));
                if (i % logEvery == 0)
                {
                    logger.out("%c[%s]...%.2f%%", 13, category, MathUtility.percentage(i - b + 1, e - b));
                }
            }
            logger.out(" %d 篇文档\n", e - b);
        }
        logger.finish(" 加载了 %d 个类目,共 %d 篇文档\n", getCatalog().size(), size());
        return this;
    }

    @Override
    public IDataSet load(String folderPath, double rate) throws IllegalArgumentException, IOException
    {
        // Fixed: previously returned null; delegate with the default UTF-8
        // charset, consistent with the other load overloads.
        return load(folderPath, "UTF-8", rate);
    }

    @Override
    public IDataSet add(Map<String, String[]> testingDataSet)
    {
        for (Map.Entry<String, String[]> entry : testingDataSet.entrySet())
        {
            for (String document : entry.getValue())
            {
                add(entry.getKey(), document);
            }
        }
        return this;
    }
}

+ 50
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/BagOfWordsDocument.java View File

@@ -0,0 +1,50 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/13 PM9:12</create-date>
*
* <copyright file="BagOfWordsDocument.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.classification.collections.FrequencyMap;

/**
 * A bag-of-words document: holds just the term-id to term-frequency map.
 * (Removed dead commented-out {@code lexicon}/{@code catalog} fields.)
 *
 * @author hankcs
 */
public class BagOfWordsDocument implements ITermFrequencyHolder
{
    /**
     * term id -> frequency of that term in this document
     */
    public FrequencyMap<Integer> tfMap;

    public BagOfWordsDocument()
    {
        tfMap = new FrequencyMap<Integer>();
    }

    public FrequencyMap<Integer> getTfMap()
    {
        return tfMap;
    }

    /**
     * Whether the document is empty (contains no words at all)
     *
     * @return true when no term has been recorded
     */
    public boolean isEmpty()
    {
        return tfMap.isEmpty();
    }
}

+ 95
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/Catalog.java View File

@@ -0,0 +1,95 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM4:56</create-date>
*
* <copyright file="Catalog.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

/**
 * Bidirectional mapping between category names and dense integer ids.
 *
 * @author hankcs
 */
public class Catalog implements Serializable
{
    Map<String, Integer> categoryId;
    List<String> idCategory;

    public Catalog()
    {
        categoryId = new TreeMap<String, Integer>();
        idCategory = new ArrayList<String>();
    }

    /**
     * Builds a catalog whose ids follow the array order.
     */
    public Catalog(String[] catalog)
    {
        this();
        int id = 0;
        for (String category : catalog)
        {
            categoryId.put(category, id++);
            idCategory.add(category);
        }
    }

    /**
     * Returns the id of the category, registering it first when unseen.
     */
    public int addCategory(String category)
    {
        Integer id = categoryId.get(category);
        if (id != null)
        {
            return id;
        }
        id = categoryId.size();
        categoryId.put(category, id);
        assert idCategory.size() == id;
        idCategory.add(category);
        return id;
    }

    /**
     * @return the category's id, or null when unknown
     */
    public Integer getId(String category)
    {
        return categoryId.get(category);
    }

    /**
     * @return the category name registered under the given id
     */
    public String getCategory(int id)
    {
        assert 0 <= id;
        assert id < idCategory.size();

        return idCategory.get(id);
    }

    public List<String> getCategories()
    {
        return idCategory;
    }

    public int size()
    {
        return idCategory.size();
    }

    /**
     * @return all category names, indexed by id
     */
    public String[] toArray()
    {
        return idCategory.toArray(new String[idCategory.size()]);
    }

    @Override
    public String toString()
    {
        return idCategory.toString();
    }
}

+ 117
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/Document.java View File

@@ -0,0 +1,117 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM4:23</create-date>
*
* <copyright file="Document.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.collection.trie.ITrie;
import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.classification.collections.FrequencyMap;

import java.io.DataInputStream;
import java.io.IOException;
import java.util.Map;

/**
 * A single document, i.e. a bag of words: every word with its frequency, plus
 * the id of the category the document belongs to.
 * (Removed the dead commented-out {@code toString()} implementation.)
 *
 * @author hankcs
 */
public class Document extends BagOfWordsDocument
{
    /**
     * Id of the category this document belongs to (-1 when unknown)
     */
    public int category;

    /**
     * Training-time constructor: unseen categories and words are registered
     * in the catalog / lexicon on the fly.
     *
     * @param catalog    category table, grown as needed
     * @param lexicon    word table, grown as needed
     * @param category   category name of this document
     * @param tokenArray segmented tokens of the document text
     */
    public Document(Catalog catalog, Lexicon lexicon, String category, String[] tokenArray)
    {
        super();
        assert catalog != null;
        assert lexicon != null;

        this.category = catalog.addCategory(category);
        // count term frequencies, assigning ids to new words along the way
        for (int i = 0; i < tokenArray.length; i++)
        {
            tfMap.add(lexicon.addWord(tokenArray[i]));
        }
    }

    /**
     * Prediction-time constructor: words absent from the trie are silently
     * dropped and no category is assigned.
     *
     * @param wordIdTrie word-to-id mapping of the trained model
     * @param tokenArray segmented tokens of the document text
     */
    public Document(ITrie<Integer> wordIdTrie, String[] tokenArray)
    {
        super();
        for (int i = 0; i < tokenArray.length; i++)
        {
            Integer id = wordIdTrie.get(tokenArray[i].toCharArray());
            if (id == null) continue; // out-of-vocabulary word
            tfMap.add(id);
        }
    }

    /**
     * Testing-time constructor: the category is looked up without growing any
     * table; an unknown category yields id -1.
     *
     * @param categoryId category-name-to-id map of the trained model
     * @param wordId     word-to-id trie of the trained model
     * @param category   category name of this document
     * @param tokenArray segmented tokens of the document text
     */
    public Document(Map<String, Integer> categoryId, BinTrie<Integer> wordId, String category, String[] tokenArray)
    {
        this(wordId, tokenArray);
        Integer id = categoryId.get(category);
        if (id == null) id = -1;
        this.category = id;
    }

    /**
     * Deserializes a document written as
     * (category, entryCount, {wordId, frequency}*) — the format produced by
     * FileDataSet's private add(Document).
     *
     * @param in stream positioned at a serialized document
     * @throws IOException on read failure
     */
    public Document(DataInputStream in) throws IOException
    {
        category = in.readInt();
        int size = in.readInt();
        tfMap = new FrequencyMap<Integer>();
        for (int i = 0; i < size; i++)
        {
            tfMap.put(in.readInt(), new int[]{in.readInt()});
        }
    }
}

+ 184
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/FileDataSet.java View File

@@ -0,0 +1,184 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/20 PM4:42</create-date>
*
* <copyright file="FileDataSet.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.classification.collections.FrequencyMap;
import com.hankcs.hanlp.classification.models.AbstractModel;

import java.io.*;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * A data set that streams documents to a binary cache file on disk instead of
 * keeping them in memory; useful for corpora larger than the heap.
 *
 * @author hankcs
 */
public class FileDataSet extends AbstractDataSet
{
    // backing cache file holding serialized documents
    File cache;
    // write end of the cache; closed (and the file re-read) by iterator()
    DataOutputStream out;
    // number of documents written so far
    int size;

    public FileDataSet(AbstractModel model, File cache) throws FileNotFoundException
    {
        super(model);
        initCache(cache);
    }

    public FileDataSet(AbstractModel model) throws IOException
    {
        this(model, File.createTempFile(String.valueOf(System.currentTimeMillis()), ".dat"));
    }

    public FileDataSet(File cache) throws FileNotFoundException
    {
        initCache(cache);
    }

    /**
     * Points this data set at the given cache file and opens it for writing
     * (truncating any previous content).
     */
    private void initCache(File cache) throws FileNotFoundException
    {
        this.cache = cache;
        out = new DataOutputStream(new FileOutputStream(cache));
    }

    /**
     * Switches to a brand-new temporary cache file.
     */
    private void initCache() throws IOException
    {
        initCache(File.createTempFile(String.valueOf(System.currentTimeMillis()), ".dat"));
    }

    public FileDataSet() throws IOException
    {
        this(File.createTempFile(String.valueOf(System.currentTimeMillis()), ".dat"));
    }

    /**
     * Converts the text into a document and appends it to the cache file.
     * IOException is wrapped in RuntimeException to satisfy the interface.
     */
    @Override
    public Document add(String category, String text)
    {
        Document document = convert(category, text);
        try
        {
            add(document);
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
        return document;
    }

    /**
     * Serializes one document as (category, entryCount, {wordId, frequency}*)
     * — the format Document(DataInputStream) reads back.
     */
    private void add(Document document) throws IOException
    {
        out.writeInt(document.category);
        Set<Map.Entry<Integer, int[]>> entrySet = document.tfMap.entrySet();
        out.writeInt(entrySet.size());
        for (Map.Entry<Integer, int[]> entry : entrySet)
        {
            out.writeInt(entry.getKey());
            out.writeInt(entry.getValue()[0]);
        }
        ++size;
    }

    @Override
    public int size()
    {
        return size;
    }

    // NOTE(review): only resets the counter; the cache file itself is not
    // truncated here — shrink() relies on that, re-reading the old file.
    @Override
    public void clear()
    {
        size = 0;
    }

    /**
     * Remaps feature ids through idMap (-1 drops a feature) and rewrites the
     * surviving documents into a fresh cache file.
     * Order matters below: iterator() first closes {@code out} and opens an
     * input stream on the OLD cache file; only then does initCache() repoint
     * {@code cache}/{@code out} at a new temp file for the rewritten output.
     */
    @Override
    public IDataSet shrink(int[] idMap)
    {
        try
        {
            clear();
            Iterator<Document> iterator = iterator();
            initCache();
            while (iterator.hasNext())
            {
                Document document = iterator.next();
                FrequencyMap<Integer> tfMap = new FrequencyMap<Integer>();
                for (Map.Entry<Integer, int[]> entry : document.tfMap.entrySet())
                {
                    Integer feature = entry.getKey();
                    if (idMap[feature] == -1) continue;
                    tfMap.put(idMap[feature], entry.getValue());
                }
                // skip blank documents (every feature was filtered out)
                if (tfMap.size() == 0) continue;
                document.tfMap = tfMap;
                add(document);
            }
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }

        return this;
    }

    /**
     * Closes the write end and replays the cache file from the beginning.
     * Only one live iterator is supported: hasNext() uses the stream's
     * remaining bytes and closes it at end-of-file.
     */
    @Override
    public Iterator<Document> iterator()
    {
        try
        {
            out.close();
            final DataInputStream in = new DataInputStream(new FileInputStream(cache));
            return new Iterator<Document>()
            {
                @Override
                public void remove()
                {
                    throw new RuntimeException("不支持的操作");
                }

                @Override
                public boolean hasNext()
                {
                    try
                    {
                        // available() > 0 works here because the source is a plain file
                        boolean next = in.available() > 0;
                        if (!next) in.close();
                        return next;
                    }
                    catch (IOException e)
                    {
                        throw new RuntimeException(e);
                    }
                }

                @Override
                public Document next()
                {
                    try
                    {
                        return new Document(in);
                    }
                    catch (IOException e)
                    {
                        throw new RuntimeException(e);
                    }
                }
            };
        }
        catch (IOException e)
        {
            throw new RuntimeException(e);
        }
    }
}

+ 146
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/IDataSet.java View File

@@ -0,0 +1,146 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM4:21</create-date>
*
* <copyright file="IClassificationCorpus.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.classification.tokenizers.ITokenizer;

import java.io.IOException;
import java.util.Map;

/**
 * Interface of a text-classification data set.
 *
 * @author hankcs
 */
public interface IDataSet extends Iterable<Document>
{
    /**
     * Loads a data set.
     *
     * @param folderPath root folder of the corpus, which must look like:<br>
     *                   root<br>
     *                   ├── categoryA<br>
     *                   │ └── 1.txt<br>
     *                   │ └── 2.txt<br>
     *                   │ └── 3.txt<br>
     *                   ├── categoryB<br>
     *                   │ └── 1.txt<br>
     *                   │ └── ...<br>
     *                   └── ...<br>
     *                   Files need not be numbered nor end in ".txt", but they must be plain text.
     * @return this data set
     * @throws IllegalArgumentException
     * @throws IOException
     */
    IDataSet load(String folderPath) throws IllegalArgumentException, IOException;
    IDataSet load(String folderPath, double rate) throws IllegalArgumentException, IOException;

    /**
     * Loads a data set.
     *
     * @param folderPath root folder of the corpus, which must look like:<br>
     *                   root<br>
     *                   ├── categoryA<br>
     *                   │ └── 1.txt<br>
     *                   │ └── 2.txt<br>
     *                   │ └── 3.txt<br>
     *                   ├── categoryB<br>
     *                   │ └── 1.txt<br>
     *                   │ └── ...<br>
     *                   └── ...<br>
     *                   Files need not be numbered nor end in ".txt", but they must be plain text.
     * @param charsetName encoding of the text files
     * @return this data set
     * @throws IllegalArgumentException
     * @throws IOException
     */
    IDataSet load(String folderPath, String charsetName) throws IllegalArgumentException, IOException;
    IDataSet load(String folderPath, String charsetName, double percentage) throws IllegalArgumentException, IOException;

    /**
     * Adds one document to the data set.
     *
     * @param category category name
     * @param text     document text
     * @return the added document
     */
    Document add(String category, String text);

    /**
     * Converts a raw text into an internal document using this data set's
     * word table and category table.
     *
     * @param category category name
     * @param text     document text
     * @return the converted document
     */
    Document convert(String category, String text);

    /**
     * Sets the tokenizer.
     *
     * @param tokenizer
     * @return this data set
     */
    IDataSet setTokenizer(ITokenizer tokenizer);

    /**
     * Number of documents in the data set.
     *
     * @return
     */
    int size();

    /**
     * Gets the tokenizer.
     *
     * @return
     */
    ITokenizer getTokenizer();

    /**
     * Gets the category table.
     *
     * @return
     */
    Catalog getCatalog();

    /**
     * Gets the word table.
     *
     * @return
     */
    Lexicon getLexicon();

    /**
     * Clears the data set.
     */
    void clear();

    /**
     * Whether this is a testing data set.
     *
     * @return
     */
    boolean isTestingDataSet();

    IDataSet add(Map<String, String[]> testingDataSet);

    IDataSet shrink(int[] idMap);

//    /**
//     * Splits the data set.
//     * @param rate size of the new data set relative to the original; e.g. with 10
//     *             documents and rate=0.1 the new set gets 1 document and 9 remain
//     * @return the new data set
//     * @throws IllegalArgumentException
//     */
//    IDataSet spilt(double rate) throws IllegalArgumentException;
}

+ 22
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/ITermFrequencyHolder.java View File

@@ -0,0 +1,22 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/13 PM9:08</create-date>
*
* <copyright file="ITermFrequencyHolder.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;

import com.hankcs.hanlp.classification.collections.FrequencyMap;

/**
 * Anything that exposes a term-frequency map keyed by term id.
 *
 * @author hankcs
 */
public interface ITermFrequencyHolder
{
    /**
     * @return map from term id to the term's frequency
     */
    FrequencyMap<Integer> getTfMap();
}

+ 88
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/Lexicon.java View File

@@ -0,0 +1,88 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM4:24</create-date>
*
* <copyright file="Lexicon.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;


import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * Vocabulary: bidirectional mapping between words and dense integer ids.
 *
 * @author hankcs
 */
public class Lexicon
{
    /**
     * word -> id trie
     */
    public BinTrie<Integer> wordId;
    /**
     * id -> word; ArrayList (not LinkedList) so getWord's index access is O(1)
     */
    public List<String> idWord;

    public Lexicon()
    {
        wordId = new BinTrie<Integer>();
        idWord = new ArrayList<String>();
    }

    /**
     * Wraps an existing word-id trie (prediction time).
     * NOTE(review): {@code idWord} stays null here, so id-to-word lookups
     * (getWord/size/getWordIdArray) must not be called on such an instance.
     *
     * @param wordIdTrie word-to-id mapping of a trained model
     */
    public Lexicon(BinTrie<Integer> wordIdTrie)
    {
        wordId = wordIdTrie;
    }

    /**
     * Returns the id of the word, registering it first when unseen.
     *
     * @param word word to look up / register
     * @return the word's id
     */
    public int addWord(String word)
    {
        assert word != null;
        char[] charArray = word.toCharArray();
        Integer id = wordId.get(charArray);
        if (id == null)
        {
            id = wordId.size();
            wordId.put(charArray, id);
            idWord.add(word);
            assert idWord.size() == wordId.size();
        }

        return id;
    }

    /**
     * @return the word's id, or null when unknown
     */
    public Integer getId(String word)
    {
        return wordId.get(word);
    }

    /**
     * Looks a word up by id.
     */
    public String getWord(int id)
    {
        assert 0 <= id;
        // BUGFIX: the upper bound used to be id <= idWord.size(), which
        // accepted the out-of-range index id == size()
        assert id < idWord.size();
        return idWord.get(id);
    }

    public int size()
    {
        return idWord.size();
    }

    /**
     * @return all words, indexed by id (insertion order == id order)
     */
    public String[] getWordIdArray()
    {
        return idWord.toArray(new String[idWord.size()]);
    }
}

+ 89
- 0
core/src/main/java/com/hankcs/hanlp/classification/corpus/MemoryDataSet.java View File

@@ -0,0 +1,89 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM5:29</create-date>
*
* <copyright file="MemoryDataSet.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.corpus;


import com.hankcs.hanlp.classification.collections.FrequencyMap;
import com.hankcs.hanlp.classification.models.AbstractModel;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
* @author hankcs
*/
public class MemoryDataSet extends AbstractDataSet
{
List<Document> documentList;
boolean editMode;

public MemoryDataSet()
{
super();
documentList = new LinkedList<Document>();
}

public MemoryDataSet(AbstractModel model)
{
super(model);
documentList = new LinkedList<Document>();
}

@Override
public Document add(String category, String text)
{
if (editMode) return null;
Document document = convert(category, text);
documentList.add(document);
return document;
}
@Override
public int size()
{
return documentList.size();
}

@Override
public void clear()
{
documentList.clear();
}

@Override
public IDataSet shrink(int[] idMap)
{
Iterator<Document> iterator = iterator();
while (iterator.hasNext())
{
Document document = iterator.next();
FrequencyMap<Integer> tfMap = new FrequencyMap<Integer>();
for (Map.Entry<Integer, int[]> entry : document.tfMap.entrySet())
{
Integer feature = entry.getKey();
if (idMap[feature] == -1) continue;
tfMap.put(idMap[feature], entry.getValue());
}
// 检查是否是空白文档
if (tfMap.size() == 0) iterator.remove();
else document.tfMap = tfMap;
}
return this;
}

@Override
public Iterator<Document> iterator()
{
return documentList.iterator();
}
}

+ 58
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/BaseFeatureData.java View File

@@ -0,0 +1,58 @@
package com.hankcs.hanlp.classification.features;

import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.classification.corpus.Catalog;
import com.hankcs.hanlp.classification.corpus.Document;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.corpus.Lexicon;

import java.util.Map;

/**
 * Holds the essential statistics shared by all classifiers; keep this lean,
 * since several classifiers reuse the same structure.
 */
public class BaseFeatureData
{
    /**
     * Number of documents in the data set
     */
    public int n;

    /**
     * Per-(feature, category) document counts: first index is the feature id,
     * second is the category id. Incremented once per document containing the
     * feature (regardless of term frequency), i.e. a joint document frequency.
     */
    public int[][] featureCategoryJointCount;

    /**
     * Number of documents per category
     */
    public int[] categoryCounts;

    /**
     * New feature mapping table (built elsewhere after feature selection —
     * not populated by this constructor)
     */
    public BinTrie<Integer> wordIdTrie;

    /**
     * Builds the statistics by one pass over the data set.
     */
    public BaseFeatureData(IDataSet dataSet)
    {
        Catalog catalog = dataSet.getCatalog();
        Lexicon lexicon = dataSet.getLexicon();
        n = dataSet.size();
        featureCategoryJointCount = new int[lexicon.size()][catalog.size()];
        categoryCounts = new int[catalog.size()];

        // run the counts
        for (Document document : dataSet)
        {
            ++categoryCounts[document.category];

            for (Map.Entry<Integer, int[]> entry : document.tfMap.entrySet())
            {
                featureCategoryJointCount[entry.getKey()][document.category] += 1;
            }
        }
    }
}

+ 148
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/ChiSquareFeatureExtractor.java View File

@@ -0,0 +1,148 @@
package com.hankcs.hanlp.classification.features;

import com.hankcs.hanlp.algorithm.MaxHeap;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.statistics.ContinuousDistributions;

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

/**
 * Feature extraction: builds the feature statistics and selects the best
 * features with a chi-square test.
 */
public class ChiSquareFeatureExtractor
{
    /**
     * Chi-square critical value at P = 0.001 (probability of a type-I error),
     * used by the feature selection algorithm
     */
    protected double chisquareCriticalValue = 10.83;

    // upper bound on the number of features kept after selection
    protected int maxSize = 1000000;

    /**
     * Builds a statistics object covering all words, categories and instances
     * of the data set; it feeds the feature selection algorithm.
     *
     * @param dataSet
     * @return
     */
    public static BaseFeatureData extractBasicFeatureData(IDataSet dataSet)
    {
        BaseFeatureData stats = new BaseFeatureData(dataSet);
        return stats;
    }

    /**
     * Performs feature selection with the chi-square test<br>
     * https://nlp.stanford.edu/IR-book/html/htmledition/feature-selectionchi2-feature-selection-1.html
     *
     * @param stats
     * @return map from selected feature id to its best chi-square score over all categories
     */
    public Map<Integer, Double> chi_square(BaseFeatureData stats)
    {
        Map<Integer, Double> selectedFeatures = new HashMap<Integer, Double>();

        double N1dot, N0dot, N00, N01, N10, N11;
        double chisquareScore;
        Double previousScore;
        for (int feature = 0; feature < stats.featureCategoryJointCount.length; feature++)
        {
            int[] categoryList = stats.featureCategoryJointCount[feature];

            // N1. : number of documents containing this feature
            N1dot = 0;
            for (int count : categoryList)
            {
                N1dot += count;
            }

            // N0. : number of documents not containing this feature
            N0dot = stats.n - N1dot;

            for (int category = 0; category < categoryList.length; category++)
            {

                N11 = categoryList[category]; // N11: documents containing the feature AND in the category
                N01 = stats.categoryCounts[category] - N11; // N01: documents in the category without the feature

                N00 = N0dot - N01; // N00: documents with neither the feature nor the category
                N10 = N1dot - N11; // N10: documents containing the feature but not in the category

                // chi-square score computed from the 2x2 contingency table above
                chisquareScore = stats.n * Math.pow(N11 * N00 - N10 * N01, 2) / ((N11 + N01) * (N11 + N10) * (N10 + N00) * (N01 + N00));

                // keep the feature when its score exceeds the critical value,
                // remembering the best score across categories
                if (chisquareScore >= chisquareCriticalValue)
                {
                    previousScore = selectedFeatures.get(feature);
                    if (previousScore == null || chisquareScore > previousScore)
                    {
                        selectedFeatures.put(feature, chisquareScore);
                    }
                }
            }
        }
        if (selectedFeatures.size() == 0) // when no feature passes the test, fall back to the full feature set
        {
            for (int feature = 0; feature < stats.featureCategoryJointCount.length; feature++)
            {
                selectedFeatures.put(feature, 0.);
            }
        }
        if (selectedFeatures.size() > maxSize)
        {
            // keep only the maxSize highest-scoring features via a bounded max-heap
            MaxHeap<Map.Entry<Integer, Double>> maxHeap = new MaxHeap<Map.Entry<Integer, Double>>(maxSize, new Comparator<Map.Entry<Integer, Double>>()
            {
                @Override
                public int compare(Map.Entry<Integer, Double> o1, Map.Entry<Integer, Double> o2)
                {
                    return o1.getValue().compareTo(o2.getValue());
                }
            });
            for (Map.Entry<Integer, Double> entry : selectedFeatures.entrySet())
            {
                maxHeap.add(entry);
            }
            selectedFeatures.clear();
            for (Map.Entry<Integer, Double> entry : maxHeap)
            {
                selectedFeatures.put(entry.getKey(), entry.getValue());
            }
        }

        return selectedFeatures;
    }

    /**
     * Gets the chi-square critical value.
     *
     * @return
     */
    public double getChisquareCriticalValue()
    {
        return chisquareCriticalValue;
    }

    /**
     * Sets the chi-square critical value.
     *
     * @param chisquareCriticalValue
     */
    public void setChisquareCriticalValue(double chisquareCriticalValue)
    {
        this.chisquareCriticalValue = chisquareCriticalValue;
    }

    /**
     * Derives the critical value from a significance level (upper-tail p).
     */
    public ChiSquareFeatureExtractor setALevel(double aLevel)
    {
        chisquareCriticalValue = ContinuousDistributions.ChisquareInverseCdf(aLevel, 1);
        return this;
    }

    /**
     * NOTE(review): this returns Cdf(criticalValue), i.e. 1 - aLevel as set by
     * setALevel (which works with the upper tail) — looks inconsistent; confirm.
     */
    public double getALevel()
    {
        return ContinuousDistributions.ChisquareCdf(chisquareCriticalValue, 1);
    }
}

+ 32
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/DfFeatureData.java View File

@@ -0,0 +1,32 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/16 AM10:37</create-date>
*
* <copyright file="DfFeatureData.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.features;

import com.hankcs.hanlp.classification.corpus.IDataSet;

/**
 * Feature statistics that additionally carry inverted document frequencies.
 * @author hankcs
 */
public class DfFeatureData extends BaseFeatureData
{
    // document frequency per feature id
    // NOTE(review): not populated by this constructor — presumably filled in
    // by the feature extractor; confirm against callers
    public int[] df;
    /**
     * Builds the base statistics for the given data set; {@code df} is left unset.
     *
     * @param dataSet
     */
    public DfFeatureData(IDataSet dataSet)
    {
        super(dataSet);
    }
}

+ 17
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/IFeatureWeighter.java View File

@@ -0,0 +1,17 @@
package com.hankcs.hanlp.classification.features;

import java.io.Serializable;

/**
 * Term weight computation.
 */
public interface IFeatureWeighter extends Serializable
{
    /**
     * Computes the weight of a term.
     *
     * @param feature the term's id
     * @param tf      the term's frequency in the document
     * @return weight
     */
    double weight(int feature, int tf);
}

+ 22
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/TfIdfFeatureWeighter.java View File

@@ -0,0 +1,22 @@
package com.hankcs.hanlp.classification.features;

/**
* TF-IDF权重计算
*/
public class TfIdfFeatureWeighter implements IFeatureWeighter
{
int numDocs;
int df[];

public TfIdfFeatureWeighter(int numDocs, int[] df)
{
this.numDocs = numDocs;
this.df = df;
}

public double weight(int feature, int tf)
{
if (feature >= df.length) System.err.println(feature);
return Math.log10(tf + 1) * (Math.log10((double) numDocs / df[feature] + 1)); // 一种改进的tf*idf计算方式;
}
}

+ 12
- 0
core/src/main/java/com/hankcs/hanlp/classification/features/TfOnlyFeatureWeighter.java View File

@@ -0,0 +1,12 @@
package com.hankcs.hanlp.classification.features;

/**
* 仅仅使用TF的权重计算方式
*/
public class TfOnlyFeatureWeighter implements IFeatureWeighter
{
public double weight(int feature, int tf)
{
return tf;
}
}

+ 37
- 0
core/src/main/java/com/hankcs/hanlp/classification/models/AbstractModel.java View File

@@ -0,0 +1,37 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/13 PM8:43</create-date>
*
* <copyright file="IModel.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.models;

import com.hankcs.hanlp.collection.trie.bintrie.BinTrie;
import com.hankcs.hanlp.classification.tokenizers.ITokenizer;

import java.io.*;

/**
 * Base class of all text-classification models; holds the data every model
 * needs to serialize.
 * @author hankcs
 */
public class AbstractModel implements Serializable
{
    /**
     * Category table: index is the category id, value the category name
     */
    public String[] catalog;
    /**
     * Tokenizer the model was built with
     */
    public ITokenizer tokenizer;
    /**
     * Word-to-id mapping
     */
    public BinTrie<Integer> wordIdTrie;
}

+ 34
- 0
core/src/main/java/com/hankcs/hanlp/classification/models/NaiveBayesModel.java View File

@@ -0,0 +1,34 @@
package com.hankcs.hanlp.classification.models;

import java.util.HashMap;
import java.util.Map;

/**
 * Data produced during naive Bayes training.
 */
public class NaiveBayesModel extends AbstractModel
{

    /**
     * Log prior probabilities log( P(c) ), keyed by category id
     */
    public Map<Integer, Double> logPriors = new HashMap<Integer, Double>();

    /**
     * Log likelihoods log( P(x|c) )
     * (nested map of ids; outer/inner key order is not visible here — confirm
     * against the trainer before relying on it)
     */
    public Map<Integer, Map<Integer, Double>> logLikelihoods = new HashMap<Integer, Map<Integer, Double>>();

    /**
     * Number of training samples
     */
    public int n = 0;
    /**
     * Number of categories
     */
    public int c = 0;
    /**
     * Number of features
     */
    public int d = 0;
}

+ 257
- 0
core/src/main/java/com/hankcs/hanlp/classification/statistics/ContinuousDistributions.java View File

@@ -0,0 +1,257 @@
/**
* Copyright (C) 2013-2017 Vasilis Vryniotis <bbriniotis@datumbox.com>
* <p>
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hankcs.hanlp.classification.statistics;

/**
 * Probability density / cumulative distribution computations for common
 * continuous distributions (Gaussian, Gamma, chi-square).<br>
 *
 * @author Vasilis Vryniotis <bbriniotis@datumbox.com>
 */
public class ContinuousDistributions
{

    /**
     * Chi-square cumulative distribution function from 0 to x.
     *
     * @param x  chi-square value
     * @param df degrees of freedom
     * @return P(X &lt;= x)
     * @throws IllegalArgumentException when df &lt;= 0
     */
    public static double ChisquareCdf(double x, int df) throws IllegalArgumentException
    {
        if (df <= 0)
        {
            throw new IllegalArgumentException();
        }

        // chi-square(df) is Gamma(shape = df/2, scale = 2)
        return GammaCdf(x / 2.0, df / 2.0);
    }

    /**
     * Given a z-value, returns the p-value (CDF) of the standard normal.<br>
     * http://jamesmccaffrey.wordpress.com/2010/11/05/programmatically-computing-the-area-under-the-normal-curve/
     *
     * @param z any value from -inf to +inf
     * @return area under the normal curve from -inf to z
     */
    public static double GaussCdf(double z)
    {
        // input = z-value (-inf to +inf)
        // output = p under Normal curve from -inf to z
        // e.g., if z = 0.0, function returns 0.5000
        // ACM Algorithm #209
        double y; // 209 scratch variable
        double p; // result. called 'z' in 209
        double w; // 209 scratch variable

        if (z == 0.0)
        {
            p = 0.0;
        }
        else
        {
            y = Math.abs(z) / 2.0;
            if (y >= 3.0)
            {
                p = 1.0;
            }
            else if (y < 1.0)
            {
                w = y * y;
                p = ((((((((0.000124818987 * w
                        - 0.001075204047) * w + 0.005198775019) * w
                        - 0.019198292004) * w + 0.059054035642) * w
                        - 0.151968751364) * w + 0.319152932694) * w
                        - 0.531923007300) * w + 0.797884560593) * y * 2.0;
            }
            else
            {
                y = y - 2.0;
                p = (((((((((((((-0.000045255659 * y
                        + 0.000152529290) * y - 0.000019538132) * y
                        - 0.000676904986) * y + 0.001390604284) * y
                        - 0.000794620820) * y - 0.002034254874) * y
                        + 0.006549791214) * y - 0.010557625006) * y
                        + 0.011630447319) * y - 0.009279453341) * y
                        + 0.005353579108) * y - 0.002141268741) * y
                        + 0.000535310849) * y + 0.999936657524;
            }
        }

        if (z > 0.0)
        {
            return (p + 1.0) / 2.0;
        }

        return (1.0 - p) / 2.0;
    }

    /**
     * Log Gamma Function (Lanczos-type approximation)
     *
     * @param Z
     * @return
     */
    public static double LogGamma(double Z)
    {
        double S = 1.0 + 76.18009173 / Z - 86.50532033 / (Z + 1.0) + 24.01409822 / (Z + 2.0) - 1.231739516 / (Z + 3.0) + 0.00120858003 / (Z + 4.0) - 0.00000536382 / (Z + 5.0);
        double LG = (Z - 0.5) * Math.log(Z + 4.5) - (Z + 4.5) + Math.log(S * 2.50662827465);

        return LG;
    }

    /**
     * Internal function used by GammaCdf: continued-fraction expansion of the
     * regularized incomplete gamma function.
     *
     * @param x
     * @param A
     * @return
     */
    protected static double Gcf(double x, double A)
    {
        // Good for X>A+1
        double A0 = 0;
        double B0 = 1;
        double A1 = 1;
        double B1 = x;
        double AOLD = 0;
        double N = 0;
        while (Math.abs((A1 - AOLD) / A1) > .00001)
        {
            AOLD = A1;
            N = N + 1;
            A0 = A1 + (N - A) * A0;
            B0 = B1 + (N - A) * B0;
            A1 = x * A0 + N * A1;
            B1 = x * B0 + N * B1;
            A0 = A0 / B1;
            B0 = B0 / B1;
            A1 = A1 / B1;
            B1 = 1;
        }
        double Prob = Math.exp(A * Math.log(x) - x - LogGamma(A)) * A1;

        return 1.0 - Prob;
    }

    /**
     * Internal function used by GammaCdf: series expansion of the regularized
     * incomplete gamma function.
     *
     * @param x
     * @param A
     * @return
     */
    protected static double Gser(double x, double A)
    {
        // Good for X<A+1.
        double T9 = 1 / A;
        double G = T9;
        double I = 1;
        while (T9 > G * 0.00001)
        {
            T9 = T9 * x / (A + I);
            G = G + T9;
            ++I;
        }
        G = G * Math.exp(A * Math.log(x) - x - LogGamma(A));

        return G;
    }

    /**
     * Regularized lower incomplete gamma function P(a, x) — the CDF of a
     * Gamma(shape a, scale 1) random variable.
     *
     * @param x
     * @param a
     * @return
     * @throws IllegalArgumentException when x &lt; 0
     */
    protected static double GammaCdf(double x, double a) throws IllegalArgumentException
    {
        if (x < 0)
        {
            throw new IllegalArgumentException();
        }

        double GI = 0;
        if (a > 200)
        {
            // normal approximation with Edgeworth corrections for large shape
            double z = (x - a) / Math.sqrt(a);
            double y = GaussCdf(z);
            double b1 = 2 / Math.sqrt(a);
            double phiz = 0.39894228 * Math.exp(-z * z / 2);
            double w = y - b1 * (z * z - 1) * phiz / 6; //Edgeworth1
            double b2 = 6 / a;
            // BUGFIX: the second Edgeworth term needs z^4; the previous code
            // computed ((int) z) ^ 4, an integer XOR, not a power.
            double z4 = z * z * z * z;
            double u = 3 * b2 * (z * z - 3) + b1 * b1 * (z4 - 10 * z * z + 15);
            GI = w - phiz * z * u / 72; //Edgeworth2
        }
        else if (x < a + 1)
        {
            GI = Gser(x, a);
        }
        else
        {
            GI = Gcf(x, a);
        }

        return GI;
    }

    /**
     * Given an upper-tail p-value and the degrees of freedom, returns the
     * chi-square critical value. Implemented with bisection, ported from the
     * JS code at http://www.fourmilab.ch/rpkp/experiments/analysis/chiCalc.js
     *
     * @param p  upper-tail probability (significance level)
     * @param df degrees of freedom
     * @return chi-square value x with 1 - ChisquareCdf(x, df) == p
     */
    public static double ChisquareInverseCdf(double p, int df)
    {
        final double CHI_EPSILON = 0.000001; /* Accuracy of critchi approximation */
        final double CHI_MAX = 99999.0; /* Maximum chi-square value */
        double minchisq = 0.0;
        double maxchisq = CHI_MAX;
        double chisqval = 0.0;

        if (p <= 0.0)
        {
            return CHI_MAX;
        }
        else if (p >= 1.0)
        {
            return 0.0;
        }

        chisqval = df / Math.sqrt(p); /* fair first value */
        while ((maxchisq - minchisq) > CHI_EPSILON)
        {
            if (1 - ChisquareCdf(chisqval, df) < p)
            {
                maxchisq = chisqval;
            }
            else
            {
                minchisq = chisqval;
            }
            chisqval = (maxchisq + minchisq) * 0.5;
        }

        return chisqval;
    }
}

+ 110
- 0
core/src/main/java/com/hankcs/hanlp/classification/statistics/evaluations/Evaluator.java View File

@@ -0,0 +1,110 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/17 PM3:10</create-date>
*
* <copyright file="Evaluator.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.statistics.evaluations;

import com.hankcs.hanlp.classification.classifiers.IClassifier;
import com.hankcs.hanlp.classification.corpus.Document;
import com.hankcs.hanlp.classification.corpus.IDataSet;
import com.hankcs.hanlp.classification.corpus.MemoryDataSet;
import com.hankcs.hanlp.utility.MathUtility;

import java.util.Map;

/**
 * Classifier performance evaluation (per-class and macro-averaged precision,
 * recall, F1 and accuracy).
 * @author hankcs
 */
public class Evaluator
{
    private Evaluator()
    {
    }

    /**
     * Evaluates a classifier on a testing data set.
     *
     * @param classifier     trained classifier
     * @param testingDataSet data set whose documents carry gold category ids
     * @return aggregated F-measure statistics (including labelling speed)
     */
    public static FMeasure evaluate(IClassifier classifier, IDataSet testingDataSet)
    {
        int c = classifier.getModel().catalog.length;
        double[] TP_FP = new double[c]; // documents predicted as each category
        double[] TP_FN = new double[c]; // gold documents of each category
        double[] TP = new double[c];    // correct predictions per category
        double time = System.currentTimeMillis();
        for (Document document : testingDataSet)
        {
            final int out = classifier.label(document);
            final int key = document.category;
            ++TP_FP[out];
            ++TP_FN[key];
            if (key == out)
            {
                ++TP[out];
            }
        }
        time = System.currentTimeMillis() - time;

        FMeasure result = calculate(c, testingDataSet.size(), TP, TP_FP, TP_FN);
        result.catalog = testingDataSet.getCatalog().toArray();
        result.speed = result.size / (time / 1000.); // documents per second

        return result;
    }

    public static FMeasure evaluate(IClassifier classifier, Map<String, String[]> testingDataSet)
    {
        return evaluate(classifier, new MemoryDataSet(classifier.getModel()).add(testingDataSet));
    }

    /**
     * Turns the raw counts into per-class and averaged metrics.
     * <p>
     * Fix: F1 used to be computed as 2pr/(p+r) unconditionally, yielding NaN
     * whenever precision + recall == 0; it is now defined as 0 in that case
     * (the usual convention), both per class and for the macro average.
     *
     * @param c     number of categories
     * @param size  number of samples
     * @param TP    correct predictions per category
     * @param TP_FP documents predicted as each category
     * @param TP_FN gold documents of each category
     * @return the populated measure object
     */
    private static FMeasure calculate(int c, int size, double[] TP, double[] TP_FP, double[] TP_FN)
    {
        double precision[] = new double[c];
        double recall[] = new double[c];
        double f1[] = new double[c];
        double accuracy[] = new double[c];
        FMeasure result = new FMeasure();
        result.size = size;

        for (int i = 0; i < c; i++)
        {
            double TN = result.size - TP_FP[i] - (TP_FN[i] - TP[i]);
            accuracy[i] = (TP[i] + TN) / result.size;
            if (TP[i] != 0)
            {
                precision[i] = TP[i] / TP_FP[i];
                recall[i] = TP[i] / TP_FN[i];
                // accumulates total correct predictions; divided by size below
                result.average_accuracy += TP[i];
            }
            else
            {
                precision[i] = 0;
                recall[i] = 0;
            }
            double pr = precision[i] + recall[i];
            f1[i] = pr == 0. ? 0. : 2 * precision[i] * recall[i] / pr;
        }
        result.average_precision = MathUtility.average(precision);
        result.average_recall = MathUtility.average(recall);
        double avgPr = result.average_precision + result.average_recall;
        result.average_f1 = avgPr == 0. ? 0. : 2 * result.average_precision * result.average_recall / avgPr;
        result.average_accuracy /= (double) result.size;
        result.accuracy = accuracy;
        result.precision = precision;
        result.recall = recall;
        result.f1 = f1;
        return result;
    }
}

+ 100
- 0
core/src/main/java/com/hankcs/hanlp/classification/statistics/evaluations/FMeasure.java View File

@@ -0,0 +1,100 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/17 PM3:11</create-date>
*
* <copyright file="FMeasure.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.statistics.evaluations;

import java.io.Serializable;

/**
 * Holder for classifier evaluation results: per-category and averaged
 * precision, recall, F1 and accuracy, plus classification speed.
 */
public class FMeasure implements Serializable
{
    /**
     * number of test samples
     */
    int size;
    /**
     * averaged accuracy
     */
    public double average_accuracy;
    /**
     * averaged precision
     */
    public double average_precision;
    /**
     * averaged recall
     */
    public double average_recall;
    /**
     * averaged F1
     */
    public double average_f1;

    /**
     * per-category accuracy
     */
    public double accuracy[];
    /**
     * per-category precision
     */
    public double precision[];
    /**
     * per-category recall
     */
    public double recall[];
    /**
     * per-category F1
     */
    public double[] f1;
    /**
     * category names
     */
    public String[] catalog;

    /**
     * classification speed (documents per second)
     */
    public double speed;

    /**
     * Renders a fixed-width table: one row per category (P, R, F1, A in percent,
     * then the category name), an "avg." row, and a size/speed footer.
     */
    @Override
    public String toString()
    {
        // widest category name, used as the width of the last (name) column
        int l = -1;
        for (String c : catalog)
        {
            l = Math.max(l, c.length());
        }
        final int w = 6;
        final StringBuilder sb = new StringBuilder(10000);

        // Character.forDigit(w, 10) is the char '6', so replacing '*' turns
        // "%*s" into "%6s" — i.e. every numeric column is 6 characters wide.
        printf(sb, "%*s\t%*s\t%*s\t%*s\t%*s%n".replace('*', Character.forDigit(w, 10)), "P", "R", "F1", "A", "");
        for (int i = 0; i < catalog.length; i++)
        {
            printf(sb, ("%*.2f\t%*.2f\t%*.2f\t%*.2f\t%"+l+"s%n").replace('*', Character.forDigit(w, 10)),
                    precision[i] * 100.,
                    recall[i] * 100.,
                    f1[i] * 100.,
                    accuracy[i] * 100.,
                    catalog[i]);
        }
        printf(sb, ("%*.2f\t%*.2f\t%*.2f\t%*.2f\t%"+l+"s%n").replace('*', Character.forDigit(w, 10)),
                average_precision * 100.,
                average_recall * 100.,
                average_f1 * 100.,
                average_accuracy * 100.,
                "avg.");
        printf(sb, "data size = %d, speed = %.2f doc/s\n", size, speed);
        return sb.toString();
    }

    // sprintf-style append helper
    private static void printf(StringBuilder sb, String format, Object... args)
    {
        sb.append(String.format(format, args));
    }
}

+ 77
- 0
core/src/main/java/com/hankcs/hanlp/classification/tokenizers/BigramTokenizer.java View File

@@ -0,0 +1,77 @@
package com.hankcs.hanlp.classification.tokenizers;

import com.hankcs.hanlp.dictionary.other.CharTable;
import com.hankcs.hanlp.dictionary.other.CharType;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * Tokenizer that emits character bigrams: the text is first split into "atoms"
 * (single Chinese characters, or maximal runs of digits/letters, with simple
 * floating-point recognition), then every pair of adjacent atoms is joined
 * into one output term.
 */
public class BigramTokenizer implements ITokenizer
{
    public String[] segment(String text)
    {
        if (text.length() == 0) return new String[0];
        char[] charArray = text.toCharArray();
        CharTable.normalization(charArray);

        // Phase 1: split into atoms, stored as {start, length} pairs.
        List<int[]> atomList = new LinkedList<int[]>();
        int start = 0;
        int end = charArray.length;
        int offsetAtom = start;
        byte preType = CharType.get(charArray[offsetAtom]);
        byte curType;
        while (++offsetAtom < end)
        {
            curType = CharType.get(charArray[offsetAtom]);
            if (preType == CharType.CT_CHINESE)
            {
                // every Chinese character is its own atom
                atomList.add(new int[]{start, offsetAtom - start});
                start = offsetAtom;
            }
            else if (curType != preType)
            {
                // floating point recognition: swallow "." followed by digits
                if (charArray[offsetAtom] == '.' && preType == CharType.CT_NUM)
                {
                    while (++offsetAtom < end)
                    {
                        curType = CharType.get(charArray[offsetAtom]);
                        if (curType != CharType.CT_NUM) break;
                    }
                }
                // only numeric/letter runs become atoms; punctuation etc. is dropped
                if (preType == CharType.CT_NUM || preType == CharType.CT_LETTER) atomList.add(new int[]{start, offsetAtom - start});
                start = offsetAtom;
            }
            preType = curType;
        }
        // Tail handling. NOTE(review): only a trailing NUM/LETTER run is flushed
        // here; a trailing Chinese character appears not to be emitted as an
        // atom — confirm whether that is intended upstream.
        if (offsetAtom == end)
            if (preType == CharType.CT_NUM || preType == CharType.CT_LETTER) atomList.add(new int[]{start, offsetAtom - start});
        if (atomList.isEmpty()) return new String[0];
        // Phase 2: join adjacent atoms into bigrams (n atoms -> n-1 terms;
        // a single atom therefore produces an empty result).
        String[] termArray = new String[atomList.size() - 1];
        Iterator<int[]> iterator = atomList.iterator();
        int[] pre = iterator.next();
        int p = -1;
        while (iterator.hasNext())
        {
            int[] cur = iterator.next();
            // the int argument is the StringBuilder capacity (combined length)
            termArray[++p] = new StringBuilder(pre[1] + cur[1]).append(charArray, pre[0], pre[1]).append(charArray, cur[0], cur[1]).toString();
            pre = cur;
        }

        return termArray;
    }

    // public static void main(String args[])
    // {
    // BigramTokenizer bws = new BigramTokenizer();
    // String[] result = bws.segment("@hankcs你好,广阔的世界2016!\u0000\u0000\t\n\r\n慶祝Coding worlds!");
    // for (String str : result)
    // {
    // System.out.println(str);
    // }
    // }

}

+ 24
- 0
core/src/main/java/com/hankcs/hanlp/classification/tokenizers/BlankTokenizer.java View File

@@ -0,0 +1,24 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM5:37</create-date>
*
* <copyright file="BlankTokenizer.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.tokenizers;

/**
 * Tokenizer that splits on the regex {@code \s} (single whitespace characters).
 * Note that String.split with this pattern can yield empty tokens for
 * consecutive whitespace, matching the original behavior.
 * @author hankcs
 */
public class BlankTokenizer implements ITokenizer
{
    public String[] segment(String text)
    {
        final String whitespacePattern = "\\s";
        return text.split(whitespacePattern);
    }
}

+ 46
- 0
core/src/main/java/com/hankcs/hanlp/classification/tokenizers/HanLPTokenizer.java View File

@@ -0,0 +1,46 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/13 PM8:02</create-date>
*
* <copyright file="HanLPTokenizer.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.tokenizers;

import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;

import java.util.List;
import java.util.ListIterator;

/**
 * Tokenizer backed by HanLP's NotionalTokenizer; terms containing the NUL
 * character are filtered out before the words are returned.
 * @author hankcs
 */
public class HanLPTokenizer implements ITokenizer
{
    public String[] segment(String text)
    {
        List<Term> termList = NotionalTokenizer.segment(text.toCharArray());
        // drop any term whose word contains '\u0000'
        ListIterator<Term> it = termList.listIterator();
        while (it.hasNext())
        {
            if (it.next().word.indexOf('\u0000') != -1)
            {
                it.remove();
            }
        }
        // copy the surviving words into the result array
        String[] words = new String[termList.size()];
        int i = 0;
        for (Term term : termList)
        {
            words[i++] = term.word;
        }
        return words;
    }
}

+ 22
- 0
core/src/main/java/com/hankcs/hanlp/classification/tokenizers/ITokenizer.java View File

@@ -0,0 +1,22 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/10 PM5:35</create-date>
*
* <copyright file="ITokenizer.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.tokenizers;

import java.io.Serializable;

/**
 * Tokenizer interface used by the classification package to turn a document
 * into an array of terms. Extends Serializable so a tokenizer can be
 * persisted together with a trained model.
 * @author hankcs
 */
public interface ITokenizer extends Serializable
{
    /** Splits the text into an array of terms. */
    String[] segment(String text);
}

+ 99
- 0
core/src/main/java/com/hankcs/hanlp/classification/utilities/CollectionUtility.java View File

@@ -0,0 +1,99 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/6 PM5:01</create-date>
*
* <copyright file="ToolUtility.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.utilities;

import java.util.*;

/**
 * Collection helpers for the classification package.
 * @author hankcs
 */
public class CollectionUtility
{
    /**
     * Returns a new map whose entries are ordered by value.
     *
     * @param input the source map (not modified)
     * @param desc  true for descending order, false for ascending
     * @return a LinkedHashMap with the sorted iteration order
     */
    public static <K, V extends Comparable<V>> Map<K, V> sortMapByValue(Map<K, V> input, final boolean desc)
    {
        LinkedHashMap<K, V> output = new LinkedHashMap<K, V>(input.size());
        ArrayList<Map.Entry<K, V>> entryList = new ArrayList<Map.Entry<K, V>>(input.entrySet());
        Collections.sort(entryList, new Comparator<Map.Entry<K, V>>()
        {
            public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2)
            {
                if (desc) return o2.getValue().compareTo(o1.getValue());
                return o1.getValue().compareTo(o2.getValue());
            }
        });
        for (Map.Entry<K, V> entry : entryList)
        {
            output.put(entry.getKey(), entry.getValue());
        }

        return output;
    }

    /**
     * Sorts a map by value in descending order.
     */
    public static <K, V extends Comparable<V>> Map<K, V> sortMapByValue(Map<K, V> input)
    {
        return sortMapByValue(input, true);
    }

    /**
     * Returns the key with the greatest score.
     *
     * @param scoreMap key -> score
     * @return the best key, or null if the map is empty
     */
    public static String max(Map<String, Double> scoreMap)
    {
        double max = Double.NEGATIVE_INFINITY;
        String best = null;
        for (Map.Entry<String, Double> entry : scoreMap.entrySet())
        {
            Double score = entry.getValue();
            if (score > max)
            {
                max = score;
                best = entry.getKey();
            }
        }

        return best;
    }

    /**
     * Splits an array into two arrays.
     *
     * @param src  source array
     * @param rate fraction (0..1) of elements that go into the first array
     * @return the two parts, in original order
     */
    public static String[][] splitArray(String[] src, double rate)
    {
        assert 0 <= rate && rate <= 1;
        String[][] output = new String[2][];
        output[0] = new String[(int) (src.length * rate)];
        output[1] = new String[src.length - output[0].length];
        System.arraycopy(src, 0, output[0], 0, output[0].length);
        System.arraycopy(src, output[0].length, output[1], 0, output[1].length);
        return output;
    }

    /**
     * Splits an array into two arrays.
     *
     * @deprecated misspelled; kept for backward compatibility. Use
     *             {@link #splitArray(String[], double)} instead.
     */
    @Deprecated
    public static String[][] spiltArray(String[] src, double rate)
    {
        return splitArray(src, rate);
    }

    /**
     * Splits a map of arrays; the source map is modified in place, keeping
     * only the second part of every array, while the returned map holds
     * the first part.
     *
     * @param src  source map (mutated!)
     * @param rate fraction that goes into the returned map
     * @return the split-off portion
     */
    public static Map<String, String[]> splitMap(Map<String, String[]> src, double rate)
    {
        assert 0 <= rate && rate <= 1;
        Map<String, String[]> output = new TreeMap<String, String[]>();
        for (Map.Entry<String, String[]> entry : src.entrySet())
        {
            String[][] array = splitArray(entry.getValue(), rate);
            output.put(entry.getKey(), array[0]);
            entry.setValue(array[1]);
        }

        return output;
    }
}

+ 150
- 0
core/src/main/java/com/hankcs/hanlp/classification/utilities/TextProcessUtility.java View File

@@ -0,0 +1,150 @@
package com.hankcs.hanlp.classification.utilities;

import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.NotionalTokenizer;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*;

/**
 * Text preprocessing utilities for building classification corpora.
 */
public class TextProcessUtility
{
    /**
     * Preprocesses text: replaces punctuation with spaces, collapses runs of
     * whitespace, and lower-cases (default locale, as in the original).
     *
     * @param text raw text
     * @return normalized text
     */
    public static String preprocess(String text)
    {
        return text.replaceAll("\\p{P}", " ").replaceAll("\\s+", " ").toLowerCase(Locale.getDefault());
    }

    /**
     * Extracts keywords (notional words). A real application might also
     * extract phrases.
     *
     * @param text raw text
     * @return the word of every notional term
     */
    public static String[] extractKeywords(String text)
    {
        List<Term> termList = NotionalTokenizer.segment(text);
        String[] wordArray = new String[termList.size()];
        Iterator<Term> iterator = termList.iterator();
        for (int i = 0; i < wordArray.length; i++)
        {
            wordArray[i] = iterator.next().word;
        }
        return wordArray;
    }

    /**
     * Counts the term frequency of every keyword.
     *
     * @param keywordArray keywords (may contain duplicates)
     * @return keyword -> frequency
     */
    public static Map<String, Integer> getKeywordCounts(String[] keywordArray)
    {
        Map<String, Integer> counts = new HashMap<String, Integer>();

        Integer counter;
        for (int i = 0; i < keywordArray.length; ++i)
        {
            counter = counts.get(keywordArray[i]);
            if (counter == null)
            {
                counter = 0;
            }
            counts.put(keywordArray[i], ++counter); // increment term frequency
        }

        return counts;
    }

    /**
     * Loads a corpus from a folder: every sub-folder is a category whose
     * files are the documents of that category.
     *
     * @param path corpus root folder
     * @return category -> documents, or null if the root cannot be listed
     */
    public static Map<String, String[]> loadCorpus(String path)
    {
        Map<String, String[]> dataSet = new TreeMap<String, String[]>();
        File root = new File(path);
        File[] folders = root.listFiles();
        if (folders == null) return null;
        for (File folder : folders)
        {
            if (folder.isFile()) continue;
            File[] files = folder.listFiles();
            if (files == null) continue;
            String[] documents = new String[files.length];
            for (int i = 0; i < files.length; i++)
            {
                documents[i] = IOUtil.readTxt(files[i].getAbsolutePath());
            }
            dataSet.put(folder.getName(), documents);
        }

        return dataSet;
    }

    /**
     * Loads a corpus from a folder, throwing on invalid input and decoding
     * files with an explicit charset.
     *
     * @param folderPath  corpus root folder
     * @param charsetName charset used to decode every file
     * @return category -> documents, or null if the root cannot be listed
     * @throws IOException              on read failure
     * @throws IllegalArgumentException if folderPath is null, missing, or not a directory
     */
    public static Map<String, String[]> loadCorpusWithException(String folderPath, String charsetName) throws IOException
    {
        if (folderPath == null) throw new IllegalArgumentException("参数 folderPath == null");
        File root = new File(folderPath);
        if (!root.exists()) throw new IllegalArgumentException(String.format("目录 %s 不存在", root.getAbsolutePath()));
        if (!root.isDirectory())
            throw new IllegalArgumentException(String.format("目录 %s 不是一个目录", root.getAbsolutePath()));

        Map<String, String[]> dataSet = new TreeMap<String, String[]>();
        File[] folders = root.listFiles();
        if (folders == null) return null;
        for (File folder : folders)
        {
            if (folder.isFile()) continue;
            File[] files = folder.listFiles();
            if (files == null) continue;
            String[] documents = new String[files.length];
            for (int i = 0; i < files.length; i++)
            {
                documents[i] = readTxt(files[i], charsetName);
            }
            dataSet.put(folder.getName(), documents);
        }

        return dataSet;
    }

    /**
     * Reads a whole text file into a String.
     * <p>
     * The buffer is sized from File.length() rather than
     * FileInputStream.available(), which is not guaranteed to report the full
     * remaining size; the stream is closed even if a read fails.
     *
     * @param file        file to read
     * @param charsetName charset used for decoding
     * @return decoded file content
     * @throws IOException on I/O failure
     */
    public static String readTxt(File file, String charsetName) throws IOException
    {
        FileInputStream is = new FileInputStream(file);
        try
        {
            byte[] targetArray = new byte[(int) file.length()];
            int off = 0;
            int len;
            while (off < targetArray.length
                    && (len = is.read(targetArray, off, targetArray.length - off)) != -1)
            {
                off += len;
            }
            // decode only the bytes actually read (off may be < length if the
            // file shrank between length() and the reads)
            return new String(targetArray, 0, off, charsetName);
        }
        finally
        {
            is.close();
        }
    }

    /**
     * Loads a corpus assuming UTF-8 encoded files.
     */
    public static Map<String, String[]> loadCorpusWithException(String corpusPath) throws IOException
    {
        return loadCorpusWithException(corpusPath, "UTF-8");
    }
}

+ 47
- 0
core/src/main/java/com/hankcs/hanlp/classification/utilities/io/ConsoleLogger.java View File

@@ -0,0 +1,47 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/16 AM11:17</create-date>
*
* <copyright file="ConsoleLogger.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.utilities.io;

/**
 * Simple logger that writes to stdout and stderr, with a basic
 * start()/finish() stopwatch for timing a section of work.
 *
 * @author hankcs
 */
public class ConsoleLogger implements ILogger
{
    /**
     * shared default logger instance
     */
    public static ILogger logger = new ConsoleLogger();
    /** timestamp (ms) recorded by the last call to start() */
    long start;

    public void out(String format, Object... args)
    {
        // format() is an exact alias of printf()
        System.out.format(format, args);
    }

    public void err(String format, Object... args)
    {
        System.err.format(format, args);
    }

    public void start(String format, Object... args)
    {
        out(format, args);
        this.start = System.currentTimeMillis();
    }

    public void finish(String format, Object... args)
    {
        long elapsedMs = System.currentTimeMillis() - this.start;
        out(String.format("耗时 %d ms", elapsedMs) + format, args);
    }
}

+ 24
- 0
core/src/main/java/com/hankcs/hanlp/classification/utilities/io/ILogger.java View File

@@ -0,0 +1,24 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>me@hankcs.com</email>
* <create-date>16/2/16 AM11:15</create-date>
*
* <copyright file="ILogger.java" company="码农场">
* Copyright (c) 2008-2016, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.classification.utilities.io;

/**
 * A minimal logging interface.
 * @author hankcs
 */
public interface ILogger
{
    /** Writes a formatted message to standard output. */
    void out(String format, Object ... args);
    /** Writes a formatted message to standard error. */
    void err(String format, Object ... args);
    /** Logs the message and marks the start of a timed section. */
    void start(String format, Object ... args);
    /** Logs the elapsed time since start() followed by the message. */
    void finish(String format, Object ... args);
}

+ 1018
- 0
core/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/AhoCorasickDoubleArrayTrie.java
File diff suppressed because it is too large
View File


+ 237
- 0
core/src/main/java/com/hankcs/hanlp/collection/AhoCorasick/State.java View File

@@ -0,0 +1,237 @@
package com.hankcs.hanlp.collection.AhoCorasick;

import java.util.*;

/**
 * <p>
 * A state provides the following functions:
 * </p>
 * <p/>
 * <ul>
 * <li>success: successfully transition to another state</li>
 * <li>failure: when the string path cannot be followed, jump to a shallower node</li>
 * <li>emits: hit a pattern string</li>
 * </ul>
 * <p/>
 * <p>
 * The root node is slightly different: it has no failure function; its
 * "failure" means transitioning to the next state along the string path.
 * All other nodes have a failure state.
 * </p>
 *
 * @author Robert Bor
 */
public class State
{

    /**
     * length of the pattern string, which is also the depth of this state
     */
    protected final int depth;

    /**
     * fail function: jump to this state when no match is found
     */
    private State failure = null;

    /**
     * pattern indices recorded when this state is reachable
     */
    private Set<Integer> emits = null;
    /**
     * goto table (transition function): next state keyed by the next character
     */
    private Map<Character, State> success = new TreeMap<Character, State>();

    /**
     * corresponding index in the double array
     */
    private int index;

    /**
     * Constructs a node of depth 0 (the root).
     */
    public State()
    {
        this(0);
    }

    /**
     * Constructs a node of the given depth.
     * @param depth the node's depth
     */
    public State(int depth)
    {
        this.depth = depth;
    }

    /**
     * Gets the node depth.
     * @return depth
     */
    public int getDepth()
    {
        return this.depth;
    }

    /**
     * Adds a matched pattern (this state corresponds to this pattern).
     * @param keyword pattern index
     */
    public void addEmit(int keyword)
    {
        if (this.emits == null)
        {
            // reverse order so that the largest id is first
            this.emits = new TreeSet<Integer>(Collections.reverseOrder());
        }
        this.emits.add(keyword);
    }

    /**
     * Gets the largest pattern id, or null if none.
     * @return largest id or null
     */
    public Integer getLargestValueId()
    {
        if (emits == null || emits.size() == 0) return null;

        return emits.iterator().next();
    }

    /**
     * Adds several matched patterns.
     * @param emits pattern indices
     */
    public void addEmit(Collection<Integer> emits)
    {
        for (int emit : emits)
        {
            addEmit(emit);
        }
    }

    /**
     * Gets the pattern(s) this node represents.
     * @return pattern indices (possibly empty)
     */
    public Collection<Integer> emit()
    {
        return this.emits == null ? Collections.<Integer>emptyList() : this.emits;
    }

    /**
     * Whether this is an accepting (terminal) state.
     * @return true if non-root and at least one pattern ends here
     */
    public boolean isAcceptable()
    {
        return this.depth > 0 && this.emits != null;
    }

    /**
     * Gets the failure state.
     * @return failure state
     */
    public State failure()
    {
        return this.failure;
    }

    /**
     * Sets the failure state and mirrors the link into the double-array
     * fail table.
     * @param failState the failure state
     * @param fail      the fail table, indexed by double-array index
     */
    public void setFailure(State failState, int fail[])
    {
        this.failure = failState;
        fail[index] = failState.index;
    }

    /**
     * Transitions to the next state.
     * @param character       transition character
     * @param ignoreRootState true when the root calls itself (so a failed
     *                        root transition returns null instead of the root)
     * @return the target state, or null
     */
    private State nextState(Character character, boolean ignoreRootState)
    {
        State nextState = this.success.get(character);
        if (!ignoreRootState && nextState == null && this.depth == 0)
        {
            nextState = this;
        }
        return nextState;
    }

    /**
     * Transitions by character; a failed transition from the root returns
     * the root itself (never null).
     */
    public State nextState(Character character)
    {
        return nextState(character, false);
    }

    /**
     * Transitions by character; a failed transition returns null for any node.
     */
    public State nextStateIgnoreRootState(Character character)
    {
        return nextState(character, true);
    }

    /**
     * Gets or creates the child state for the given character.
     */
    public State addState(Character character)
    {
        State nextState = nextStateIgnoreRootState(character);
        if (nextState == null)
        {
            nextState = new State(this.depth + 1);
            this.success.put(character, nextState);
        }
        return nextState;
    }

    /** All child states. */
    public Collection<State> getStates()
    {
        return this.success.values();
    }

    /** All outgoing transition characters. */
    public Collection<Character> getTransitions()
    {
        return this.success.keySet();
    }

    /**
     * Debug representation. NOTE(review): printing {@code failure} recurses
     * down the failure chain; chains point to strictly shallower nodes, so
     * this should terminate — confirm against the automaton construction.
     */
    @Override
    public String toString()
    {
        final StringBuilder sb = new StringBuilder("State{");
        sb.append("depth=").append(depth);
        sb.append(", ID=").append(index);
        sb.append(", emits=").append(emits);
        sb.append(", success=").append(success.keySet());
        sb.append(", failureID=").append(failure == null ? "-1" : failure.index);
        sb.append(", failure=").append(failure);
        sb.append('}');
        return sb.toString();
    }

    /**
     * Gets the goto table.
     * @return character -> child state
     */
    public Map<Character, State> getSuccess()
    {
        return success;
    }

    public int getIndex()
    {
        return index;
    }

    public void setIndex(int index)
    {
        this.index = index;
    }
}

+ 1092
- 0
core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAG.java
File diff suppressed because it is too large
View File


+ 211
- 0
core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGMap.java View File

@@ -0,0 +1,211 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/12/21 18:59</create-date>
*
* <copyright file="MDAGMap.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.MDAG;

import com.hankcs.hanlp.utility.ByteUtil;

import java.util.*;

/**
 * A Map view over an MDAG. Prefer not to use an MDAG as a map: the current
 * implementation appends an int (as two chars) after each key, which makes all
 * right languages distinct and degrades the structure into a bintrie.
 * @author hankcs
 */
public class MDAGMap<V> extends AbstractMap<String, V>
{
    ArrayList<V> valueList = new ArrayList<V>();
    MDAGForMap mdag = new MDAGForMap();

    /**
     * NOTE(review): unlike the Map contract, an existing key is NOT
     * overwritten — the old value is kept and returned.
     */
    @Override
    public V put(String key, V value)
    {
        V origin = get(key);
        if (origin == null)
        {
            valueList.add(value);
            // encode the value's index in valueList as two chars after a NUL delimiter
            char[] twoChar = ByteUtil.convertIntToTwoChar(valueList.size() - 1);
            mdag.addString(key + MDAGForMap.DELIMITER + twoChar[0] + twoChar[1]);
        }
        return origin;
    }

    @Override
    public V get(Object key)
    {
        int valueIndex = mdag.getValueIndex(key.toString());
        if (valueIndex != -1)
        {
            return valueList.get(valueIndex);
        }
        return null;
    }

    public V get(String key)
    {
        int valueIndex = mdag.getValueIndex(key);
        if (valueIndex != -1)
        {
            return valueList.get(valueIndex);
        }
        return null;
    }

    /**
     * Builds the entry set from the key set.
     * (Previously this computed the key set, discarded it, and returned null,
     * violating the Map contract.)
     */
    @Override
    public Set<Entry<String, V>> entrySet()
    {
        Set<Entry<String, V>> entrySet = new LinkedHashSet<Entry<String, V>>();
        for (String key : keySet())
        {
            entrySet.add(new SimpleEntry<String, V>(key, get(key)));
        }
        return entrySet;
    }

    @Override
    public Set<String> keySet()
    {
        HashSet<String> stringSet = mdag.getAllStrings();
        LinkedHashSet<String> keySet = new LinkedHashSet<String>();
        Iterator<String> iterator = stringSet.iterator();
        while (iterator.hasNext())
        {
            String key = iterator.next();
            // strip the 3-char suffix: delimiter + two value-index chars
            keySet.add(key.substring(0, key.length() - 3));
        }
        return keySet;
    }

    /**
     * Common-prefix query.
     * @param key   text to query
     * @param begin offset to start from
     * @return entries for every key that is a prefix of the text
     */
    public LinkedList<Entry<String, V>> commonPrefixSearchWithValue(char[] key, int begin)
    {
        LinkedList<Entry<String, Integer>> valueIndex = mdag.commonPrefixSearchWithValueIndex(key, begin);
        LinkedList<Entry<String, V>> entryList = new LinkedList<Entry<String, V>>();
        for (Entry<String, Integer> entry : valueIndex)
        {
            entryList.add(new SimpleEntry<String, V>(entry.getKey(), valueList.get(entry.getValue())));
        }

        return entryList;
    }

    /**
     * Common-prefix query.
     * @param key text to query
     * @return entries for every key that is a prefix of the text
     */
    public LinkedList<Entry<String, V>> commonPrefixSearchWithValue(String key)
    {
        return commonPrefixSearchWithValue(key.toCharArray(), 0);
    }

    /**
     * Further reduces memory and speeds up queries.<br>
     * Side effect: the next insertion becomes slower.
     */
    public void simplify()
    {
        mdag.simplify();
    }

    public void unSimplify()
    {
        mdag.unSimplify();
    }

    /**
     * MDAG specialization that stores the value index as a NUL delimiter
     * followed by two chars appended to every key.
     */
    static class MDAGForMap extends MDAG
    {
        static final char DELIMITER = Character.MIN_VALUE;

        public int getValueIndex(String key)
        {
            if (sourceNode != null) //if the MDAG hasn't been simplified
            {
                MDAGNode currentNode = sourceNode.transition(key.toCharArray());
                if (currentNode == null) return -1;
                return getValueIndex(currentNode);

            }
            else
            {
                SimpleMDAGNode currentNode = simplifiedSourceNode.transition(mdagDataArray, key.toCharArray());
                if (currentNode == null) return -1;
                return getValueIndex(currentNode);
            }

        }

        private int getValueIndex(SimpleMDAGNode currentNode)
        {
            SimpleMDAGNode targetNode = currentNode.transition(mdagDataArray, DELIMITER);
            if (targetNode == null) return -1;
            // what follows must be a single chain: delimiter -> high char -> low char
            int transitionSetBeginIndex = targetNode.getTransitionSetBeginIndex();
            assert targetNode.getOutgoingTransitionSetSize() == 1 : "不是单链!";
            char high = mdagDataArray[transitionSetBeginIndex].getLetter();
            targetNode = targetNode.transition(mdagDataArray, high);
            assert targetNode.getOutgoingTransitionSetSize() == 1 : "不是单链!";
            transitionSetBeginIndex = targetNode.getTransitionSetBeginIndex();
            char low = mdagDataArray[transitionSetBeginIndex].getLetter();
            return ByteUtil.convertTwoCharToInt(high, low);
        }

        private int getValueIndex(MDAGNode currentNode)
        {
            MDAGNode targetNode = currentNode.transition(DELIMITER);
            if (targetNode == null) return -1;
            // what follows must be a single chain: delimiter -> high char -> low char
            TreeMap<Character, MDAGNode> outgoingTransitions = targetNode.getOutgoingTransitions();
            assert outgoingTransitions.size() == 1 : "不是单链!";
            char high = outgoingTransitions.keySet().iterator().next();
            targetNode = targetNode.transition(high);
            outgoingTransitions = targetNode.getOutgoingTransitions();
            assert outgoingTransitions.size() == 1 : "不是单链!";
            char low = outgoingTransitions.keySet().iterator().next();
            return ByteUtil.convertTwoCharToInt(high, low);
        }

        public LinkedList<Entry<String, Integer>> commonPrefixSearchWithValueIndex(char[] key, int begin)
        {
            LinkedList<Map.Entry<String, Integer>> result = new LinkedList<Map.Entry<String, Integer>>();
            if (sourceNode != null)
            {
                int charCount = key.length;
                MDAGNode currentNode = sourceNode;
                for (int i = 0; i < charCount; ++i)
                {
                    currentNode = currentNode.transition(key[begin + i]);
                    if (currentNode == null) break;
                    {
                        int index = getValueIndex(currentNode);
                        if (index != -1) result.add(new SimpleEntry<String, Integer>(new String(key, begin, i + 1), index));
                    }
                }
            }
            else
            {
                int charCount = key.length;
                SimpleMDAGNode currentNode = simplifiedSourceNode;
                for (int i = 0; i < charCount; ++i)
                {
                    currentNode = currentNode.transition(mdagDataArray, key[begin + i]);
                    if (currentNode == null) break;
                    {
                        int index = getValueIndex(currentNode);
                        if (index != -1) result.add(new SimpleEntry<String, Integer>(new String(key, begin, i + 1), index));
                    }
                }
            }

            return result;
        }
    }
}

+ 548
- 0
core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGNode.java View File

@@ -0,0 +1,548 @@
/**
* MDAG is a Java library capable of constructing character-sequence-storing,
* directed acyclic graphs of minimal size.
*
* Copyright (C) 2012 Kevin Lawson <Klawson88@gmail.com>
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.hankcs.hanlp.collection.MDAG;

import java.util.Map.Entry;
import java.util.Stack;
import java.util.TreeMap;


/**
* MDAG中的一个节点<br>
* The class which represents a node in a MDAG.

* @author Kevin
*/
public class MDAGNode
{
//The boolean denoting the accept state status of this node
/**
* 是否是终止状态
*/
private boolean isAcceptNode;
//The TreeMap to contain entries that represent a _transition (label and target node)
/**
* 状态转移函数
*/
private final TreeMap<Character, MDAGNode> outgoingTransitionTreeMap;

//The int representing this node's incoming _transition node count
/**
* 入度
*/
private int incomingTransitionCount = 0;
//The int denoting index in a simplified mdag data array that this node's _transition set begins at
/**
* 在简化的MDAG中表示该节点的转移状态集合的起始位置
*/
private int transitionSetBeginIndex = -1;
//The int which will store this node's hash code after its been calculated (necessary due to how expensive the hashing calculation is)
/**
* 当它被计算后的hash值
*/
private Integer storedHashCode = null;
/**
* 建立一个节点<br>
* Constructs an MDAGNode.
* @param isAcceptNode 是否是终止状态 a boolean denoting the accept state status of this node
*/
public MDAGNode(boolean isAcceptNode)
{
this.isAcceptNode = isAcceptNode;
outgoingTransitionTreeMap = new TreeMap<Character, MDAGNode>();
}

/**
* 克隆一个状态<br>
* Constructs an MDAGNode possessing the same accept state status and outgoing transitions as another.
* @param node the MDAGNode possessing the accept state status and
* outgoing transitions that the to-be-created MDAGNode is to take on
*/
private MDAGNode(MDAGNode node)
{
isAcceptNode = node.isAcceptNode;
outgoingTransitionTreeMap = new TreeMap<Character, MDAGNode>(node.outgoingTransitionTreeMap);
//Loop through the nodes in this node's outgoing _transition set, incrementing the number of
//incoming transitions of each by 1 (to account for this newly created node's outgoing transitions)
for(Entry<Character, MDAGNode> transitionKeyValuePair : outgoingTransitionTreeMap.entrySet())
transitionKeyValuePair.getValue().incomingTransitionCount++;
/////
}
/**
* 克隆一个状态<br>
* Creates an MDAGNode possessing the same accept state status and outgoing transitions as this node.
* @return an MDAGNode possessing the same accept state status and outgoing transitions as this node
*/
public MDAGNode clone()
{
return new MDAGNode(this);
}
/**
* 克隆一个状态<br>
* 原来soleParentNode转移到本状态,现在转移到克隆后的状态
* Creates an MDAGNode possessing the same accept state status ant _transition set
* (incoming & outgoing) as this node. outgoing transitions as this node.
* @param soleParentNode the MDAGNode possessing the only _transition that targets this node
* @param parentToCloneTransitionLabelChar the char which labels the _transition from {@code soleParentNode} to this node
* @return an MDAGNode possessing the same accept state status and _transition set as this node.
*/
public MDAGNode clone(MDAGNode soleParentNode, char parentToCloneTransitionLabelChar)
{
MDAGNode cloneNode = new MDAGNode(this);
soleParentNode.reassignOutgoingTransition(parentToCloneTransitionLabelChar, this, cloneNode);
return cloneNode;
}

/**
* Retrieves the index in a simplified mdag data array that the SimpleMDAGNode
* representation of this node's outgoing _transition set begins at.
* @return the index in a simplified mdag data array that this node's _transition set begins at,
* or -1 if its _transition set is not present in such an array
*/
public int getTransitionSetBeginIndex()
{
return transitionSetBeginIndex;
}
/**
* Retrieves this node's outgoing _transition count.
* @return an int representing this node's number of outgoing transitions
*/
public int getOutgoingTransitionCount()
{
return outgoingTransitionTreeMap.size();
}
/**
* Retrieves this node's incoming _transition count
* @return an int representing this node's number of incoming transitions
*/
public int getIncomingTransitionCount()
{
return incomingTransitionCount;
}
/**
* Determines if this node is a confluence node
* (defined as a node with two or more incoming transitions
* @return true if this node has two or more incoming transitions, false otherwise
*/
public boolean isConfluenceNode()
{
return (incomingTransitionCount > 1);
}
/**
* Retrieves the accept state status of this node.
* @return true if this node is an accept state, false otherwise
*/
public boolean isAcceptNode()
{
return isAcceptNode;
}
/**
* Sets this node's accept state status.
*
* @param isAcceptNode a boolean representing the desired accept state status
*/
public void setAcceptStateStatus(boolean isAcceptNode)
{
this.isAcceptNode = isAcceptNode;
}
/**
* 转移状态在数组中的起始下标<br>
* Records the index that this node's _transition set starts at
* in an array containing this node's containing MDAG data (simplified MDAG).
* @param transitionSetBeginIndex a _transition set
*/
public void setTransitionSetBeginIndex(int transitionSetBeginIndex)
{
this.transitionSetBeginIndex = transitionSetBeginIndex;
}
/**
* Determines whether this node has an outgoing _transition with a given label.
* @param letter the char labeling the desired _transition
* @return true if this node possesses a _transition labeled with
* {@code letter}, and false otherwise
*/
public boolean hasOutgoingTransition(char letter)
{
return outgoingTransitionTreeMap.containsKey(letter);
}
/**
 * Determines whether this node has any outgoing transitions.
 *
 * @return true if at least one outgoing transition exists
 */
public boolean hasTransitions()
{
    return outgoingTransitionTreeMap.size() > 0;
}
/**
 * Follows the outgoing transition of this node labeled with the given char.
 *
 * @param letter the label of the desired transition
 * @return the target MDAGNode, or null if no such transition exists
 */
public MDAGNode transition(char letter)
{
    return this.outgoingTransitionTreeMap.get(letter);
}
/**
 * Follows the transition path spelled out by a String, starting from this node.
 *
 * @param str a String corresponding to a transition path in the MDAG
 * @return the MDAGNode at the end of the path, or null if the path is absent
 */
public MDAGNode transition(String str)
{
    MDAGNode node = this;
    int len = str.length();
    // Walk the MDAG one char at a time; stop as soon as the path breaks off.
    for (int i = 0; node != null && i < len; i++)
    {
        node = node.transition(str.charAt(i));
    }
    return node;
}

/**
 * Follows the transition path spelled out by a char array, starting from this node.
 *
 * @param str chars corresponding to a transition path in the MDAG
 * @return the MDAGNode at the end of the path, or null if the path is absent
 */
public MDAGNode transition(char[] str)
{
    MDAGNode node = this;
    for (char c : str)
    {
        node = node.transition(c);
        if (node == null) return null;
    }
    return node;
}

/**
 * Follows the transition path spelled out by {@code str[offset..]}, starting
 * from this node.
 *
 * @param str    chars containing a transition path
 * @param offset index of the first char to transition on
 * @return the MDAGNode at the end of the path, or null if the path is absent
 */
public MDAGNode transition(char[] str, int offset)
{
    MDAGNode node = this;
    for (int i = offset; i < str.length; i++)
    {
        node = node.transition(str[i]);
        if (node == null) return null;
    }
    return node;
}

/**
 * Collects the nodes visited along the transition path denoted by a String,
 * starting from this node. Note that when the path breaks off early, the
 * final (null) transition result is still added, so the returned stack may
 * end with a null entry — callers rely on this to detect a broken path.
 *
 * @param str a String denoting a transition path starting at this node
 * @return a Stack of the MDAGNodes encountered, in traversal order
 */
public Stack<MDAGNode> getTransitionPathNodes(String str)
{
    Stack<MDAGNode> path = new Stack<MDAGNode>();
    MDAGNode node = this;
    int len = str.length();
    for (int i = 0; i < len && node != null; i++)
    {
        node = node.transition(str.charAt(i));
        path.add(node);
    }
    return path;
}

/**
 * Retrieves this node's outgoing transitions.
 * The returned map is this node's internal map, not a copy; mutating it
 * mutates the node.
 *
 * @return a TreeMap from transition label to target node
 */
public TreeMap<Character, MDAGNode> getOutgoingTransitions()
{
    return this.outgoingTransitionTreeMap;
}
/**
 * Decrements (by 1) the incoming-transition count of every node that is the
 * target of an outgoing transition from this node.
 */
public void decrementTargetIncomingTransitionCounts()
{
    for (MDAGNode target : outgoingTransitionTreeMap.values())
    {
        target.incomingTransitionCount--;
    }
}
/**
 * Reassigns the target node of one of this node's outgoing transitions,
 * keeping both targets' incoming-transition counts consistent.
 *
 * @param letter        the char labeling the transition of interest
 * @param oldTargetNode the transition's current target
 * @param newTargetNode the transition's new target
 */
public void reassignOutgoingTransition(char letter, MDAGNode oldTargetNode, MDAGNode newTargetNode)
{
    oldTargetNode.incomingTransitionCount--;
    newTargetNode.incomingTransitionCount++;
    this.outgoingTransitionTreeMap.put(letter, newTargetNode);
}
/**
 * Creates an outgoing transition labeled with the given char whose target is
 * a freshly created node.
 *
 * @param letter                  the char labeling the new transition
 * @param targetAcceptStateStatus the accept-state status of the new target node
 * @return the newly created target MDAGNode
 */
public MDAGNode addOutgoingTransition(char letter, boolean targetAcceptStateStatus)
{
    MDAGNode target = new MDAGNode(targetAcceptStateStatus);
    target.incomingTransitionCount++;
    this.outgoingTransitionTreeMap.put(letter, target);
    return target;
}

/**
 * Creates an outgoing transition (edge) from this node to an existing node,
 * incrementing the target's incoming-transition count.
 *
 * @param letter        the char labeling the edge
 * @param newTargetNode the edge's target node
 * @return the target node
 */
public MDAGNode addOutgoingTransition(char letter, MDAGNode newTargetNode)
{
    newTargetNode.incomingTransitionCount++;
    this.outgoingTransitionTreeMap.put(letter, newTargetNode);
    return newTargetNode;
}
/**
 * Removes the transition labeled with the given char. Only the link from this
 * node is removed; the target node is not deleted. The target's
 * incoming-transition count is NOT decremented here — NOTE(review): callers
 * appear responsible for that bookkeeping; confirm against call sites.
 *
 * @param letter the char labeling the transition to remove
 */
public void removeOutgoingTransition(char letter)
{
    this.outgoingTransitionTreeMap.remove(letter);
}


/**
* 是否含有相同的转移函数
* @param node1
* @param node2
* @return
*/
public static boolean haveSameTransitions(MDAGNode node1, MDAGNode node2)
{
TreeMap<Character, MDAGNode> outgoingTransitionTreeMap1 = node1.outgoingTransitionTreeMap;
TreeMap<Character, MDAGNode> outgoingTransitionTreeMap2 = node2.outgoingTransitionTreeMap;
if(outgoingTransitionTreeMap1.size() == outgoingTransitionTreeMap2.size())
{
//For each _transition in outgoingTransitionTreeMap1, get the identically lableed _transition
//in outgoingTransitionTreeMap2 (if present), and test the equality of the transitions' target nodes
for(Entry<Character, MDAGNode> transitionKeyValuePair : outgoingTransitionTreeMap1.entrySet())
{
Character currentCharKey = transitionKeyValuePair.getKey();
MDAGNode currentTargetNode = transitionKeyValuePair.getValue();
if(!outgoingTransitionTreeMap2.containsKey(currentCharKey) || !outgoingTransitionTreeMap2.get(currentCharKey).equals(currentTargetNode))
return false;
}
/////
}
else
return false;
return true;
}
/**
 * Clears this node's cached hash value so it is recomputed on next use.
 */
public void clearStoredHashCode()
{
    this.storedHashCode = null;
}
/**
 * Evaluates equality with another object. Two MDAGNodes are equal iff they
 * have the same accept-state status and equivalent outgoing transition sets
 * (compared recursively).
 *
 * @param obj an object
 * @return true if {@code obj} is an equivalent MDAGNode
 */
@Override
public boolean equals(Object obj)
{
    if (this == obj) return true;
    if (obj == null || !obj.getClass().equals(MDAGNode.class)) return false;
    MDAGNode other = (MDAGNode) obj;
    return isAcceptNode == other.isAcceptNode && haveSameTransitions(this, other);
}

/**
 * Hashes this node from its accept-state status and outgoing transition map
 * (which recursively hashes the nodes on all transition paths stemming from
 * this node). The result is cached in {@code storedHashCode} and reused
 * until {@link #clearStoredHashCode()} is called.
 *
 * @return this node's hash code
 */
@Override
public int hashCode()
{
    Integer cached = storedHashCode;
    if (cached != null) return cached;
    int hash = 7;
    hash = 53 * hash + (this.isAcceptNode ? 1 : 0);
    hash = 53 * hash + (this.outgoingTransitionTreeMap != null ? this.outgoingTransitionTreeMap.hashCode() : 0);
    storedHashCode = hash;
    return hash;
}

@Override
public String toString()
{
    StringBuilder sb = new StringBuilder("MDAGNode{");
    sb.append("isAcceptNode=").append(isAcceptNode)
            .append(", outgoingTransitionTreeMap=").append(outgoingTransitionTreeMap.keySet())
            .append(", incomingTransitionCount=").append(incomingTransitionCount)
            .append('}');
    return sb.toString();
}
}

+ 165
- 0
core/src/main/java/com/hankcs/hanlp/collection/MDAG/MDAGSet.java View File

@@ -0,0 +1,165 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/12/20 22:10</create-date>
*
* <copyright file="MDAGSet.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.MDAG;

import java.io.File;
import java.io.IOException;
import java.util.*;

/**
* 基于MDAG(又称DAWG,Minimal Acyclic Finite-State Automata)的String Set
*
* @author hankcs
*/
public class MDAGSet extends MDAG implements Set<String>
{
    /**
     * Builds the set from a binary MDAG data file.
     *
     * @param dataFile file containing serialized MDAG data
     * @throws IOException if the file cannot be read
     */
    public MDAGSet(File dataFile) throws IOException
    {
        super(dataFile);
    }

    /**
     * Builds the set from a collection of strings.
     *
     * @param strCollection the strings to store
     */
    public MDAGSet(Collection<String> strCollection)
    {
        super(strCollection);
    }

    /**
     * Creates an empty set.
     */
    public MDAGSet()
    {
    }

    /**
     * Builds the set from a dictionary file (one entry per line).
     *
     * @param dictionaryPath path to the dictionary
     * @throws IOException if the file cannot be read
     */
    public MDAGSet(String dictionaryPath) throws IOException
    {
        super(dictionaryPath);
    }

    /**
     * {@inheritDoc}
     * O(n): materializes every stored string in order to count them.
     */
    @Override
    public int size()
    {
        return getAllStrings().size();
    }

    @Override
    public boolean isEmpty()
    {
        // BUGFIX: previously returned equivalenceClassMDAGNodeHashMap.size() != 0,
        // i.e. TRUE for a NON-empty set (inverted), and that map can also be
        // empty after simplification even when strings are stored.
        return size() == 0;
    }

    @Override
    public boolean contains(Object o)
    {
        // instanceof also rejects null, avoiding the NPE the previous
        // o.getClass() check would have thrown for contains(null).
        if (!(o instanceof String)) return false;
        return contains((String) o);
    }

    @Override
    public Iterator<String> iterator()
    {
        // Iterates a snapshot collection built by getAllStrings(); the
        // iterator is NOT backed by the MDAG itself.
        return getAllStrings().iterator();
    }

    @Override
    public Object[] toArray()
    {
        return getAllStrings().toArray();
    }

    @Override
    public <T> T[] toArray(T[] a)
    {
        return getAllStrings().toArray(a);
    }

    @Override
    public boolean add(String s)
    {
        // Honor the Set contract: report whether the set actually changed
        // (previously this returned true unconditionally).
        if (contains(s)) return false;
        addString(s);
        return true;
    }

    @Override
    public boolean remove(Object o)
    {
        String key = (o instanceof String) ? (String) o : o.toString();
        // Honor the Set contract: only remove (and report true) when present
        // (previously this returned true unconditionally).
        if (!contains(key)) return false;
        removeString(key);
        return true;
    }

    @Override
    public boolean containsAll(Collection<?> c)
    {
        for (Object e : c)
            if (!contains(e))
                return false;
        return true;
    }

    @Override
    public boolean addAll(Collection<? extends String> c)
    {
        boolean modified = false;
        for (String e : c)
            if (add(e))
                modified = true;
        return modified;
    }

    @Override
    public boolean retainAll(Collection<?> c)
    {
        // BUGFIX: the previous implementation called Iterator.remove() on the
        // snapshot returned by getAllStrings(), which removed entries from the
        // snapshot only — the MDAG itself was never modified. Iterate the
        // snapshot and remove unwanted strings from the MDAG explicitly.
        boolean modified = false;
        for (String s : getAllStrings())
        {
            if (!c.contains(s))
            {
                removeString(s);
                modified = true;
            }
        }
        return modified;
    }

    @Override
    public boolean removeAll(Collection<?> c)
    {
        // BUGFIX: same snapshot-iterator problem as retainAll — removals now
        // go through remove(), which updates the MDAG.
        boolean modified = false;
        for (Object o : c)
        {
            if (contains(o))
            {
                remove(o);
                modified = true;
            }
        }
        return modified;
    }

    @Override
    public void clear()
    {
        // Reset all MDAG state to that of a freshly constructed, empty MDAG.
        sourceNode = new MDAGNode(false);
        simplifiedSourceNode = null;
        if (equivalenceClassMDAGNodeHashMap != null)
            equivalenceClassMDAGNodeHashMap.clear();
        mdagDataArray = null;
        charTreeSet.clear();
        transitionCount = 0;
    }
}

+ 312
- 0
core/src/main/java/com/hankcs/hanlp/collection/MDAG/SimpleMDAGNode.java View File

@@ -0,0 +1,312 @@
/**
* MDAG is a Java library capable of constructing character-sequence-storing,
* directed acyclic graphs of minimal size.
*
* Copyright (C) 2012 Kevin Lawson <Klawson88@gmail.com>
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hankcs.hanlp.collection.MDAG;


import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.ICacheAble;

import java.io.DataOutputStream;

/**
* The class capable of representing a MDAG node, its _transition set, and one of its incoming transitions;
* objects of this class are used to represent a MDAG after its been simplified in order to save space.
*
* @author Kevin
*/
public class SimpleMDAGNode implements ICacheAble
{
    /** Label of the incoming transition leading to this node. */
    private char letter;

    /** Accept-state flag of this node. */
    private boolean isAcceptNode;

    /** Number of entries in this node's outgoing transition set. */
    private int transitionSetSize;

    /** Index in the containing array at which this node's transition set begins. */
    private int transitionSetBeginIndex;


    /**
     * Constructs a SimpleMDAGNode.
     *
     * @param letter            label of the transition leading to this node
     * @param isAcceptNode      accept-state status of this node
     * @param transitionSetSize size of this node's outgoing transition set
     */
    public SimpleMDAGNode(char letter, boolean isAcceptNode, int transitionSetSize)
    {
        this.letter = letter;
        this.isAcceptNode = isAcceptNode;
        this.transitionSetSize = transitionSetSize;
        // Overwritten later for every real node; 0 is needed for dummy-root creation.
        this.transitionSetBeginIndex = 0;
    }

    public SimpleMDAGNode()
    {

    }


    /**
     * Retrieves the char labeling the transition leading up to this node.
     *
     * @return the transition label
     */
    public char getLetter()
    {
        return letter;
    }


    /**
     * Retrieves the accept-state status of this node.
     *
     * @return true if this node is an accept state, false otherwise
     */
    public boolean isAcceptNode()
    {
        return isAcceptNode;
    }


    /**
     * Retrieves the index in the containing array at which this node's
     * transition set begins.
     *
     * @return the transition set's starting index
     */
    public int getTransitionSetBeginIndex()
    {
        return transitionSetBeginIndex;
    }


    /**
     * Retrieves the size of this node's outgoing transition set.
     *
     * @return the number of outgoing transitions
     */
    public int getOutgoingTransitionSetSize()
    {
        return transitionSetSize;
    }


    /**
     * Records the index in the containing array at which this node's
     * transition set begins.
     *
     * @param transitionSetBeginIndex the transition set's starting index
     */
    public void setTransitionSetBeginIndex(int transitionSetBeginIndex)
    {
        this.transitionSetBeginIndex = transitionSetBeginIndex;
    }


    /**
     * Follows the outgoing transition of this node labeled with the given char.
     *
     * @param mdagDataArray the array of SimpleMDAGNodes containing this node
     * @param letter        the desired transition's label
     * @return the target SimpleMDAGNode, or null if no such transition exists
     */
    public SimpleMDAGNode transition(SimpleMDAGNode[] mdagDataArray, char letter)
    {
        int index = binarySearch(mdagDataArray, letter);
        return index < 0 ? null : mdagDataArray[index];
    }

    /**
     * Linear-scan lookup of a transition; kept as a reference implementation
     * (binarySearch is used on the hot path).
     */
    private SimpleMDAGNode transitionBruteForce(SimpleMDAGNode[] mdagDataArray, char letter)
    {
        int end = transitionSetBeginIndex + transitionSetSize;
        for (int i = transitionSetBeginIndex; i < end; i++)
        {
            if (mdagDataArray[i].getLetter() == letter)
            {
                return mdagDataArray[i];
            }
        }
        return null;
    }

    /**
     * Binary search for a transition label inside this node's (letter-sorted)
     * transition set.
     *
     * @param mdagDataArray the node array
     * @param node          the label to locate
     * @return the index of the matching entry, or -1 if absent
     */
    private int binarySearch(SimpleMDAGNode[] mdagDataArray, char node)
    {
        if (transitionSetSize < 1)
        {
            return -1;
        }
        int lo = transitionSetBeginIndex;
        int hi = transitionSetBeginIndex + transitionSetSize - 1;
        while (lo <= hi)
        {
            // Unsigned shift avoids overflow for large (lo + hi).
            int mid = (lo + hi) >>> 1;
            char midLetter = mdagDataArray[mid].getLetter();
            if (midLetter < node)
            {
                lo = mid + 1;
            }
            else if (midLetter > node)
            {
                hi = mid - 1;
            }
            else
            {
                return mid;
            }
        }
        return -1;
    }


    /**
     * Follows the transition path spelled out by a String, starting from this node.
     *
     * @param mdagDataArray the array of SimpleMDAGNodes containing this node
     * @param str           a String corresponding to a transition path in the MDAG
     * @return the SimpleMDAGNode at the end of the path, or null if the path
     *         is not present in the MDAG
     */
    public SimpleMDAGNode transition(SimpleMDAGNode[] mdagDataArray, String str)
    {
        SimpleMDAGNode node = this;
        int len = str.length();
        for (int i = 0; node != null && i < len; i++)
        {
            node = node.transition(mdagDataArray, str.charAt(i));
        }
        return node;
    }

    /**
     * Follows the transition path spelled out by a char array, starting from this node.
     *
     * @param mdagDataArray the array of SimpleMDAGNodes containing this node
     * @param str           chars corresponding to a transition path in the MDAG
     * @return the SimpleMDAGNode at the end of the path, or null if absent
     */
    public SimpleMDAGNode transition(SimpleMDAGNode[] mdagDataArray, char[] str)
    {
        SimpleMDAGNode node = this;
        for (char c : str)
        {
            node = node.transition(mdagDataArray, c);
            if (node == null) return null;
        }
        return node;
    }

    /**
     * Follows the transition path spelled out by {@code str[offset..]},
     * starting from this node.
     *
     * @param mdagDataArray the array of SimpleMDAGNodes containing this node
     * @param str           chars containing a transition path
     * @param offset        index of the first char to transition on
     * @return the SimpleMDAGNode at the end of the path, or null if absent
     */
    public SimpleMDAGNode transition(SimpleMDAGNode[] mdagDataArray, char[] str, int offset)
    {
        SimpleMDAGNode node = this;
        for (int i = offset; i < str.length; i++)
        {
            node = node.transition(mdagDataArray, str[i]);
            if (node == null) return null;
        }
        return node;
    }


    /**
     * Follows a transition path starting from the source node of a MDAG.
     *
     * @param mdagDataArray the array containing the data of the MDAG to traverse
     * @param sourceNode    the dummy SimpleMDAGNode functioning as the source
     *                      of the MDAG data in {@code mdagDataArray}
     * @param str           a String corresponding to a transition path
     * @return the SimpleMDAGNode at the end of the path, or null if absent
     */
    public static SimpleMDAGNode traverseMDAG(SimpleMDAGNode[] mdagDataArray, SimpleMDAGNode sourceNode, String str)
    {
        return sourceNode.transition(mdagDataArray, str.toCharArray());
    }

    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder("SimpleMDAGNode{");
        sb.append("letter=").append(letter)
                .append(", isAcceptNode=").append(isAcceptNode)
                .append(", transitionSetSize=").append(transitionSetSize)
                .append(", transitionSetBeginIndex=").append(transitionSetBeginIndex)
                .append('}');
        return sb.toString();
    }

    /**
     * Serializes this node. The field order here must stay in sync with
     * {@link #load(ByteArray)}.
     */
    @Override
    public void save(DataOutputStream out) throws Exception
    {
        out.writeChar(letter);
        out.writeByte(isAcceptNode ? 1 : 0);
        out.writeInt(transitionSetBeginIndex);
        out.writeInt(transitionSetSize);
    }

    /**
     * Deserializes this node; must read fields in the exact order written by
     * {@link #save(DataOutputStream)}.
     */
    @Override
    public boolean load(ByteArray byteArray)
    {
        letter = byteArray.nextChar();
        isAcceptNode = byteArray.nextByte() == 1;
        transitionSetBeginIndex = byteArray.nextInt();
        transitionSetSize = byteArray.nextInt();
        return true;
    }
}

+ 198
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/DartMap.java View File

@@ -0,0 +1,198 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/12/22 18:17</create-date>
*
* <copyright file="DartMap.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.dartsclone;

import com.hankcs.hanlp.collection.trie.ITrie;
import com.hankcs.hanlp.corpus.io.ByteArray;

import java.io.DataOutputStream;
import java.util.*;

/**
* 双数组trie树map,更省内存,原本希望代替DoubleArrayTrie,后来发现效率不够
* @author hankcs
*/
public class DartMap<V> extends DoubleArray implements Map<String, V>, ITrie<V>
{
    /** Values, indexed by the integer id stored inside the double array. */
    V[] valueArray;

    /**
     * Builds the map from a parallel key list / value array.
     *
     * @param keyList    keys — NOTE(review): darts-clone requires sorted
     *                   keys; confirm callers pass them in order
     * @param valueArray values, parallel to {@code keyList}
     */
    public DartMap(List<String> keyList, V[] valueArray)
    {
        int[] indexArray = new int[valueArray.length];
        for (int i = 0; i < indexArray.length; ++i)
        {
            indexArray[i] = i;
        }
        this.valueArray = valueArray;
        build(keyList, indexArray);
    }

    /**
     * Builds the map from a sorted TreeMap.
     *
     * @param map key-value pairs (TreeMap guarantees sorted iteration order)
     */
    public DartMap(TreeMap<String, V> map)
    {
        build(map);
    }

    public DartMap()
    {
    }

    /**
     * Number of key-value mappings.
     * BUGFIX: previously this was inherited from DoubleArray, whose size()
     * returns the internal unit-array length — far larger than the mapping
     * count the Map contract requires.
     */
    @Override
    public int size()
    {
        return valueArray == null ? 0 : valueArray.length;
    }

    @Override
    public boolean isEmpty()
    {
        return size() == 0;
    }

    @Override
    public boolean containsKey(Object key)
    {
        return containsKey(key.toString());
    }

    /**
     * Tests whether the given key is stored.
     *
     * @param key the key to look up
     * @return true if the key is present
     */
    public boolean containsKey(String key)
    {
        return exactMatchSearch(key) != -1;
    }

    /**
     * Tests whether any key maps to the given value (linear scan).
     * BUGFIX: previously always returned false, violating the Map contract.
     */
    @Override
    public boolean containsValue(Object value)
    {
        if (valueArray == null) return false;
        for (V v : valueArray)
        {
            if (v == null ? value == null : v.equals(value)) return true;
        }
        return false;
    }

    @Override
    public V get(Object key)
    {
        return get(key.toString());
    }

    /**
     * Builds the trie from a sorted key-value map.
     *
     * @param keyValueMap key-value pairs
     * @return always 0 (legacy ITrie convention)
     */
    @Override
    public int build(TreeMap<String, V> keyValueMap)
    {
        int size = keyValueMap.size();
        int[] indexArray = new int[size];
        // Unchecked: backed by an Object[]; safe because elements are only
        // ever read back as V through this class.
        valueArray = (V[]) keyValueMap.values().toArray();
        List<String> keyList = new ArrayList<String>(size);
        int i = 0;
        for (Entry<String, V> entry : keyValueMap.entrySet())
        {
            indexArray[i] = i;
            valueArray[i] = entry.getValue();
            keyList.add(entry.getKey());
            ++i;
        }
        build(keyList, indexArray);
        return 0;
    }

    /**
     * Not supported — NOTE(review): stub that always reports failure; use
     * DoubleArray.save(OutputStream) for the trie structure instead.
     */
    @Override
    public boolean save(DataOutputStream out)
    {
        return false;
    }

    /**
     * Not supported — NOTE(review): stub that always reports failure.
     */
    @Override
    public boolean load(ByteArray byteArray, V[] value)
    {
        return false;
    }

    @Override
    public V get(char[] key)
    {
        return get(new String(key));
    }

    /**
     * Looks up the value for a key.
     *
     * @param key the key
     * @return the mapped value, or null if the key is absent
     */
    public V get(String key)
    {
        int id = exactMatchSearch(key);
        if (id == -1) return null;
        return valueArray[id];
    }

    /**
     * Returns the internal value array; the {@code a} argument is ignored
     * (legacy ITrie convention — do not mutate the returned array).
     */
    @Override
    public V[] getValueArray(V[] a)
    {
        return valueArray;
    }

    /**
     * Common-prefix query: finds every stored key that is a prefix of
     * {@code key} (starting at {@code offset}) together with its value.
     *
     * @param key        the query string
     * @param offset     byte offset to start matching from
     * @param maxResults maximum number of results to return
     * @return list of (matched prefix, value) pairs
     */
    public ArrayList<Pair<String, V>> commonPrefixSearch(String key, int offset, int maxResults)
    {
        byte[] keyBytes = key.getBytes(utf8);
        List<Pair<Integer, Integer>> pairList = commonPrefixSearch(keyBytes, offset, maxResults);
        ArrayList<Pair<String, V>> resultList = new ArrayList<Pair<String, V>>(pairList.size());
        for (Pair<Integer, Integer> pair : pairList)
        {
            // pair.first = matched byte length, pair.second = value id
            resultList.add(new Pair<String, V>(new String(keyBytes, 0, pair.first), valueArray[pair.second]));
        }
        return resultList;
    }

    public ArrayList<Pair<String, V>> commonPrefixSearch(String key)
    {
        return commonPrefixSearch(key, 0, Integer.MAX_VALUE);
    }

    @Override
    public V put(String key, V value)
    {
        throw new UnsupportedOperationException("双数组不支持增量式插入");
    }

    @Override
    public V remove(Object key)
    {
        throw new UnsupportedOperationException("双数组不支持删除");
    }

    @Override
    public void putAll(Map<? extends String, ? extends V> m)
    {
        throw new UnsupportedOperationException("双数组不支持增量式插入");
    }

    @Override
    public void clear()
    {
        throw new UnsupportedOperationException("双数组不支持");
    }

    @Override
    public Set<String> keySet()
    {
        throw new UnsupportedOperationException("双数组不支持");
    }

    @Override
    public Collection<V> values()
    {
        return Arrays.asList(valueArray);
    }

    @Override
    public Set<Entry<String, V>> entrySet()
    {
        throw new UnsupportedOperationException("双数组不支持");
    }
}

+ 224
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/DoubleArray.java View File

@@ -0,0 +1,224 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone;


import com.hankcs.hanlp.collection.dartsclone.details.DoubleArrayBuilder;
import com.hankcs.hanlp.collection.dartsclone.details.Keyset;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
* 双数组DAWG
*
* @author manabe
*/
public class DoubleArray implements Serializable
{
    /** Charset used to convert String keys to bytes before trie lookup. */
    static Charset utf8 = Charset.forName("UTF-8");

    /**
     * Builds the double array.
     *
     * @param keys   keys as byte arrays — NOTE(review): darts-clone requires
     *               keys in ascending byte order; confirm callers sort them
     * @param values values, parallel to {@code keys}
     */
    public void build(byte[][] keys, int[] values)
    {
        Keyset keyset = new Keyset(keys, values);
        DoubleArrayBuilder builder = new DoubleArrayBuilder();
        builder.build(keyset);

        _array = builder.copy();
    }

    /**
     * Builds the double array from String keys (encoded as UTF-8).
     *
     * @param keys   the keys
     * @param values values, parallel to {@code keys}
     */
    public void build(List<String> keys, int[] values)
    {
        byte[][] byteKey = new byte[keys.size()][];
        Iterator<String> iteratorKey = keys.iterator();
        int i = 0;
        while (iteratorKey.hasNext())
        {
            byteKey[i] = iteratorKey.next().getBytes(utf8);
            ++i;
        }
        build(byteKey, values);
    }

    /**
     * Read from a stream. The stream must implement the available() method
     * accurately, since the unit count is derived from it.
     *
     * @param stream the stream to read trie units from (closed on return)
     * @throws java.io.IOException on read failure
     */
    public void open(InputStream stream) throws IOException
    {

        int size = (int) (stream.available() / UNIT_SIZE);
        _array = new int[size];

        DataInputStream in = null;
        try
        {
            in = new DataInputStream(new BufferedInputStream(
                    stream));
            for (int i = 0; i < size; ++i)
            {
                _array[i] = in.readInt();
            }
        }
        finally
        {
            if (in != null)
            {
                in.close();
            }
        }
    }

    /**
     * Saves the trie data into a stream (closed on return).
     *
     * @param stream the target stream
     * @throws java.io.IOException on write failure
     */
    public void save(OutputStream stream) throws IOException
    {
        DataOutputStream out = null;
        try
        {
            out = new DataOutputStream(new BufferedOutputStream(
                    stream));
            for (int i = 0; i < _array.length; ++i)
            {
                out.writeInt(_array[i]);
            }
        }
        finally
        {
            if (out != null)
            {
                out.close();
            }
        }
    }

    // Custom Java serialization: persist only the unit array.
    private void writeObject(ObjectOutputStream out) throws IOException
    {
        out.writeObject(_array);
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException
    {
        _array = (int[]) in.readObject();
    }

    /**
     * Returns the corresponding value if the key is found. Otherwise returns -1.
     * This method converts the key into UTF-8.
     *
     * @param key search key
     * @return found value, or -1 if the key is absent
     */
    public int exactMatchSearch(String key)
    {
        return exactMatchSearch(key.getBytes(utf8));
    }

    /**
     * Returns the corresponding value if the key is found. Otherwise returns -1.
     *
     * The inline bit operations decode a darts-clone "unit":
     * NOTE(review): layout inferred from the reference C++ implementation —
     * low 8 bits = label, bit 8 = has_leaf, bit 9 = extension flag scaling
     * the offset, bits 10..30 = offset, bit 31 = leaf marker; confirm against
     * DoubleArrayBuilder before relying on this description.
     *
     * @param key search key (raw bytes)
     * @return found value, or -1 if the key is absent
     */
    public int exactMatchSearch(byte[] key)
    {
        int unit = _array[0];
        int nodePos = 0;

        for (byte b : key)
        {
            // nodePos ^= unit.offset() ^ b
            nodePos ^= ((unit >>> 10) << ((unit & (1 << 9)) >>> 6)) ^ (b & 0xFF);
            unit = _array[nodePos];
            // if (unit.label() != b)
            if ((unit & ((1 << 31) | 0xFF)) != (b & 0xff))
            {
                return -1;
            }
        }
        // if (!unit.has_leaf()) {
        if (((unit >>> 8) & 1) != 1)
        {
            return -1;
        }
        // unit = _array[nodePos ^ unit.offset()];
        unit = _array[nodePos ^ ((unit >>> 10) << ((unit & (1 << 9)) >>> 6))];
        // return unit.value();
        return unit & ((1 << 31) - 1);
    }

    /**
     * Returns the keys that begins with the given key and its corresponding values.
     * The first of the returned pair represents the length of the found key.
     *
     * @param key        the query, as raw bytes
     * @param offset     byte position to start matching from
     * @param maxResults maximum number of results collected
     * @return found (matched length, value) pairs, shortest prefix first
     */
    public List<Pair<Integer, Integer>> commonPrefixSearch(byte[] key,
                                                           int offset,
                                                           int maxResults)
    {
        ArrayList<Pair<Integer, Integer>> result = new ArrayList<Pair<Integer, Integer>>();
        int unit = _array[0];
        int nodePos = 0;
        // nodePos ^= unit.offset();
        nodePos ^= ((unit >>> 10) << ((unit & (1 << 9)) >>> 6));
        for (int i = offset; i < key.length; ++i)
        {
            byte b = key[i];
            nodePos ^= (b & 0xff);
            unit = _array[nodePos];
            // if (unit.label() != b) {
            if ((unit & ((1 << 31) | 0xFF)) != (b & 0xff))
            {
                return result;
            }

            // nodePos ^= unit.offset();
            nodePos ^= ((unit >>> 10) << ((unit & (1 << 9)) >>> 6));

            // if (unit.has_leaf()) {
            if (((unit >>> 8) & 1) == 1)
            {
                if (result.size() < maxResults)
                {
                    // result.add(new Pair<i, _array[nodePos].value());
                    result.add(new Pair<Integer, Integer>(i + 1, _array[nodePos] & ((1 << 31) - 1)));
                }
            }
        }
        return result;
    }

    /**
     * Size of the internal unit array (NOT the number of stored keys).
     *
     * @return the unit count
     */
    public int size()
    {
        return _array.length;
    }

    /** Bytes per unit: sizeof(int). */
    private static final int UNIT_SIZE = 4; // sizeof(int)
    /** The double-array units. */
    private int[] _array;
}

+ 47
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/Pair.java View File

@@ -0,0 +1,47 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone;

/**
* 模拟C++中的pair,也兼容JavaFX中的Pair
* @author manabe
*/
public class Pair<T, U>
{
    /** First element (the "key"). */
    public final T first;
    /** Second element (the "value"). */
    public final U second;

    /**
     * Creates an immutable pair.
     *
     * @param first  the first element (may be null)
     * @param second the second element (may be null)
     */
    public Pair(T first, U second)
    {
        this.first = first;
        this.second = second;
    }

    public T getFirst()
    {
        return first;
    }

    /** JavaFX-style alias for {@link #getFirst()}. */
    public T getKey()
    {
        return first;
    }

    public U getSecond()
    {
        return second;
    }

    /** JavaFX-style alias for {@link #getSecond()}. */
    public U getValue()
    {
        return second;
    }

    @Override
    public String toString()
    {
        return first + "=" + second;
    }

    /**
     * Value-based, null-safe equality over both elements.
     * BUGFIX: the class advertises compatibility with JavaFX's Pair but
     * previously inherited identity equals/hashCode, so equal pairs did not
     * compare equal and misbehaved as hash keys.
     */
    @Override
    public boolean equals(Object obj)
    {
        if (this == obj) return true;
        if (!(obj instanceof Pair)) return false;
        Pair<?, ?> other = (Pair<?, ?>) obj;
        return (first == null ? other.first == null : first.equals(other.first))
                && (second == null ? other.second == null : second.equals(other.second));
    }

    @Override
    public int hashCode()
    {
        int h = (first == null) ? 0 : first.hashCode();
        return 31 * h + ((second == null) ? 0 : second.hashCode());
    }
}

+ 176
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/AutoBytePool.java View File

@@ -0,0 +1,176 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

/**
* 动态数组<br>
* Memory management of resizable array.
*
* @author
*/
class AutoBytePool
{
    /** Backing storage (allocated lazily, may be longer than {@link #size()}). */
    private byte[] buffer;
    /** Number of live elements. */
    private int count;
    /** Allocated length of {@link #buffer}. */
    private int capacity;

    /**
     * Returns the raw backing array.
     *
     * @return the buffer (may contain unused trailing slots)
     */
    byte[] getBuffer()
    {
        return buffer;
    }

    /**
     * Reads the byte at the given index.
     *
     * @param id element index
     * @return the stored byte
     */
    byte get(int id)
    {
        return buffer[id];
    }

    /**
     * Stores a byte at the given index.
     *
     * @param id    element index
     * @param value the byte to store
     */
    void set(int id, byte value)
    {
        buffer[id] = value;
    }

    /**
     * Tests whether the pool holds no elements.
     *
     * @return true when empty
     */
    boolean empty()
    {
        return count == 0;
    }

    /**
     * Number of live elements.
     *
     * @return the element count
     */
    int size()
    {
        return count;
    }

    /**
     * Releases the backing storage and resets the pool to its initial state.
     */
    void clear()
    {
        resize(0);
        buffer = null;
        count = 0;
        capacity = 0;
    }

    /**
     * Appends a byte, growing the buffer if necessary.
     *
     * @param value the byte to append
     */
    void add(byte value)
    {
        if (count == capacity)
        {
            grow(count + 1);
        }
        buffer[count++] = value;
    }

    /**
     * Drops the last element (caller must ensure the pool is non-empty).
     */
    void deleteLast()
    {
        count--;
    }

    /**
     * Sets the element count, growing storage if needed. Shrinking keeps the
     * existing buffer.
     *
     * @param size the new element count
     */
    void resize(int size)
    {
        if (size > capacity)
        {
            grow(size);
        }
        count = size;
    }

    /**
     * Grows the pool to the given size, filling every new slot with
     * {@code value}.
     *
     * @param size  the new element count
     * @param value fill byte for new slots
     */
    void resize(int size, byte value)
    {
        if (size > capacity)
        {
            grow(size);
        }
        while (count < size)
        {
            buffer[count++] = value;
        }
    }

    /**
     * Ensures capacity for at least {@code size} elements without changing
     * the element count.
     *
     * @param size the requested capacity
     */
    void reserve(int size)
    {
        if (size > capacity)
        {
            grow(size);
        }
    }

    /**
     * Reallocates the buffer: jumps straight to {@code size} when it is at
     * least double the current capacity, otherwise rounds up to the next
     * power of two (starting at 1).
     *
     * @param size the minimum required capacity
     */
    private void grow(int size)
    {
        int newCapacity;
        if (size >= capacity * 2)
        {
            newCapacity = size;
        }
        else
        {
            newCapacity = 1;
            while (newCapacity < size)
            {
                newCapacity <<= 1;
            }
        }
        byte[] newBuffer = new byte[newCapacity];
        if (count > 0)
        {
            System.arraycopy(buffer, 0, newBuffer, 0, count);
        }
        buffer = newBuffer;
        capacity = newCapacity;
    }
}

+ 118
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/AutoIntPool.java View File

@@ -0,0 +1,118 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

/**
* 整型动态数组<br>
* Memory management of resizable array.
*
* @author
*/
class AutoIntPool
{
    /** Backing storage (may be longer than {@link #size()}). */
    private int[] buffer;
    /** Number of live elements. */
    private int count;
    /** Allocated length of {@link #buffer}. */
    private int capacity;

    /** @return the raw backing array (may contain unused trailing slots) */
    int[] getBuffer()
    {
        return buffer;
    }

    /** @param id element index
     *  @return the stored int */
    int get(int id)
    {
        return buffer[id];
    }

    /** Stores {@code value} at index {@code id}. */
    void set(int id, int value)
    {
        buffer[id] = value;
    }

    /** @return true when the pool holds no elements */
    boolean empty()
    {
        return count == 0;
    }

    /** @return the number of live elements */
    int size()
    {
        return count;
    }

    /** Releases the backing storage and resets the pool to its initial state. */
    void clear()
    {
        resize(0);
        buffer = null;
        count = 0;
        capacity = 0;
    }

    /** Appends {@code value}, growing the buffer if necessary. */
    void add(int value)
    {
        if (count == capacity)
        {
            grow(count + 1);
        }
        buffer[count++] = value;
    }

    /** Drops the last element (caller must ensure the pool is non-empty). */
    void deleteLast()
    {
        count--;
    }

    /** Sets the element count, growing storage if needed. */
    void resize(int size)
    {
        if (size > capacity)
        {
            grow(size);
        }
        count = size;
    }

    /** Grows to {@code size} elements, filling new slots with {@code value}. */
    void resize(int size, int value)
    {
        if (size > capacity)
        {
            grow(size);
        }
        while (count < size)
        {
            buffer[count++] = value;
        }
    }

    /** Ensures capacity for at least {@code size} elements. */
    void reserve(int size)
    {
        if (size > capacity)
        {
            grow(size);
        }
    }

    /**
     * Reallocates the buffer: jumps straight to {@code size} when it is at
     * least double the current capacity, otherwise rounds up to the next
     * power of two (starting at 1).
     */
    private void grow(int size)
    {
        int newCapacity;
        if (size >= capacity * 2)
        {
            newCapacity = size;
        }
        else
        {
            newCapacity = 1;
            while (newCapacity < size)
            {
                newCapacity <<= 1;
            }
        }
        int[] newBuffer = new int[newCapacity];
        if (count > 0)
        {
            System.arraycopy(buffer, 0, newBuffer, 0, count);
        }
        buffer = newBuffer;
        capacity = newCapacity;
    }
}

+ 148
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/BitVector.java View File

@@ -0,0 +1,148 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

/**
 * Bit vector, similar to C++'s bitset. After {@link #build()} it also
 * answers rank queries (how many 1-bits occur up to a position) in O(1).
 * @author
 */
class BitVector
{
    /**
     * Read one bit.
     * @param id bit position
     * @return whether the bit is 1
     */
    boolean get(int id)
    {
        return (_units.get(id / UNIT_SIZE) >>> (id % UNIT_SIZE) & 1) == 1;
    }

    /**
     * Write one bit. The previous implementation ignored {@code bit == false},
     * so a bit could never be cleared; the else branch now clears it,
     * mirroring the upstream darts-clone implementation.
     * @param id  bit position
     * @param bit value to store
     */
    void set(int id, boolean bit)
    {
        if (bit)
        {
            _units.set(id / UNIT_SIZE, _units.get(id / UNIT_SIZE)
                    | 1 << (id % UNIT_SIZE));
        }
        else
        {
            _units.set(id / UNIT_SIZE, _units.get(id / UNIT_SIZE)
                    & ~(1 << (id % UNIT_SIZE)));
        }
    }

    /**
     * Number of 1-bits in positions [0, id]. Only valid after {@link #build()}.
     * @param id bit position (inclusive)
     * @return count of set bits up to and including {@code id}
     */
    int rank(int id)
    {
        int unit_id = id / UNIT_SIZE;
        // _ranks holds the count of all earlier units; the mask keeps only
        // the bits of the current unit up to and including position id.
        return _ranks[unit_id] + popCount(_units.get(unit_id)
                & (~0 >>> (UNIT_SIZE - (id % UNIT_SIZE) - 1)));
    }

    /**
     * @return whether the vector holds no storage units
     */
    boolean empty()
    {
        return _units.empty();
    }

    /**
     * @return total number of 1-bits, as computed by {@link #build()}
     */
    int numOnes()
    {
        return _numOnes;
    }

    /**
     * @return number of bits appended so far
     */
    int size()
    {
        return _size;
    }

    /**
     * Appends one 0-bit, allocating a new unit on every 32-bit boundary.
     */
    void append()
    {
        if ((_size % UNIT_SIZE) == 0)
        {
            _units.add(0);
        }
        ++_size;
    }

    /**
     * Precomputes per-unit cumulative 1-bit counts so {@link #rank(int)}
     * runs in constant time.
     */
    void build()
    {
        _ranks = new int[_units.size()];

        _numOnes = 0;
        for (int i = 0; i < _units.size(); ++i)
        {
            _ranks[i] = _numOnes;
            _numOnes += popCount(_units.get(i));
        }
    }

    /**
     * Releases storage. NOTE(review): _size and _numOnes are deliberately
     * left untouched, matching the upstream implementation.
     */
    void clear()
    {
        _units.clear();
        _ranks = null;
    }

    /**
     * Bits per storage unit: sizeof(int) * 8.
     */
    private static final int UNIT_SIZE = 32;

    /**
     * Branch-free population count (number of 1-bits) of one 32-bit word.
     * @param unit the word
     * @return number of set bits
     */
    private static int popCount(int unit)
    {
        unit = ((unit & 0xAAAAAAAA) >>> 1) + (unit & 0x55555555);
        unit = ((unit & 0xCCCCCCCC) >>> 2) + (unit & 0x33333333);
        unit = ((unit >>> 4) + unit) & 0x0F0F0F0F;
        unit += unit >>> 8;
        unit += unit >>> 16;
        return unit & 0xFF;
    }

    /**
     * Storage units, 32 bits each.
     */
    private AutoIntPool _units = new AutoIntPool();
    /**
     * Cumulative 1-bit count per unit; populated by build().
     */
    private int[] _ranks;
    /**
     * Total number of 1-bits; populated by build().
     */
    private int _numOnes;
    /**
     * Number of bits appended.
     */
    private int _size;
}

+ 492
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/DawgBuilder.java View File

@@ -0,0 +1,492 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

import java.util.ArrayList;

/**
 * Directed acyclic word graph (DAWG) builder. Keys are inserted in sorted
 * order into a temporary trie of {@link DawgNode}s; finished sibling chains
 * are "flushed" into packed int units, and a hash table merges chains that
 * are structurally identical — that sharing is what turns the trie into a
 * DAWG. Unit packing: leaf = (value &lt;&lt; 1) | hasSibling; inner node =
 * (child &lt;&lt; 2) | isState | hasSibling (see {@link DawgNode#unit()}).
 * @author
 */
class DawgBuilder
{
    /**
     * Id of the root node/unit.
     * @return 0
     */
    int root()
    {
        return 0;
    }

    /**
     * First child of a unit.
     * @param id unit id
     * @return the child unit id (bits above the two flag bits)
     */
    int child(int id)
    {
        // return _units.get(id).child();
        return _units.get(id) >>> 2;
    }

    /**
     * Next sibling of a unit. Siblings occupy consecutive unit ids.
     * @param id unit id
     * @return the next sibling's id, or 0 when there is none
     */
    int sibling(int id)
    {
        // return _units.get(id).hasSibling() ? (id + 1) : 0;
        return ((_units.get(id) & 1) == 1) ? (id + 1) : 0;
    }

    /**
     * Value stored in a leaf unit (leaf packing is value &lt;&lt; 1 | flag).
     * @param id unit id
     * @return the stored value
     */
    int value(int id)
    {
        // return _units.get(id).value();
        return _units.get(id) >>> 1;
    }

    /**
     * Whether the unit is a leaf, i.e. its label is the 0 terminator byte.
     * @param id unit id
     * @return true for leaves
     */
    boolean isLeaf(int id)
    {
        return label(id) == 0;
    }

    /**
     * Label (input byte) attached to a unit.
     * @param id unit id
     * @return the label byte
     */
    byte label(int id)
    {
        return _labels.get(id);
    }

    /**
     * Whether several paths converge on this unit (a shared suffix head).
     * @param id unit id
     * @return true when the unit is an intersection
     */
    boolean isIntersection(int id)
    {
        return _isIntersections.get(id);
    }

    /**
     * Dense 0-based index of an intersection unit among all intersections.
     * Only valid after finish() has built the rank structure.
     */
    int intersectionId(int id)
    {
        return _isIntersections.rank(id) - 1;
    }

    /**
     * Total number of intersection units.
     */
    int numIntersections()
    {
        return _isIntersections.numOnes();
    }

    /**
     * Number of units built so far.
     */
    int size()
    {
        return _units.size();
    }

    /**
     * Initialization: allocate the hash table, create the root node and
     * root unit, and push the root onto the working stack. The root node's
     * label 0xFF is a sentinel that cannot collide with any key byte order.
     */
    void init()
    {
        _table.resize(INITIAL_TABLE_SIZE, 0);

        appendNode();
        appendUnit();

        _numStates = 1;

        _nodes.get(0).label = (byte) 0xFF;
        _nodeStack.add(0);
    }

    /**
     * Finalize: flush everything still on the stack, materialize the root
     * unit, release the intermediate structures and build the rank index
     * over the intersection bits.
     */
    void finish()
    {
        flush(0);

        _units.set(0, _nodes.get(0).unit());
        _labels.set(0, _nodes.get(0).label);

        _nodes.clear();
        _table.clear();
        _nodeStack.clear();
        _recycleBin.clear();

        _isIntersections.build();
    }

    /**
     * Inserts one key with its value. Keys must arrive in ascending byte
     * order, must not contain the 0 byte, and values must be non-negative.
     *
     * @param key   key bytes
     * @param value non-negative value
     * @throws IllegalArgumentException on bad value, empty key, embedded 0
     *                                  byte, or out-of-order key
     */
    void insert(byte[] key, int value)
    {
        if (value < 0)
        {
            throw new IllegalArgumentException(
                    "failed to insert key: negative value");
        }
        if (key.length == 0)
        {
            // NOTE(review): "inset" typo in the message kept as-is (runtime string).
            throw new IllegalArgumentException(
                    "failed to inset key: zero-length key");
        }

        int id = 0;
        int keyPos = 0;

        // Phase 1: follow the existing trie path while it matches the key.
        for (; keyPos <= key.length; ++keyPos)
        {
            int childId = _nodes.get(id).child;
            if (childId == 0)
            {
                break;
            }

            // Position key.length reads the implicit 0 terminator.
            byte keyLabel = keyPos < key.length ? key[keyPos] : 0;
            if (keyPos < key.length && keyLabel == 0)
            {
                throw new IllegalArgumentException(
                        "failed to insert key: invalid null character");
            }

            byte unitLabel = _nodes.get(childId).label;
            // Compare as unsigned bytes; sorted input guarantees >=.
            if ((keyLabel & 0xFF) < (unitLabel & 0xFF))
            {
                throw new IllegalArgumentException(
                        "failed to insert key: wrong key order");
            }
            else if ((keyLabel & 0xFF) > (unitLabel & 0xFF))
            {
                // Diverging: the previous child chain is complete — flush it.
                _nodes.get(childId).hasSibling = true;
                flush(childId);
                break;
            }
            id = childId;
        }

        if (keyPos > key.length)
        {
            return;
        }

        // Phase 2: append fresh nodes for the remaining suffix (including
        // the implicit 0 terminator at keyPos == key.length).
        for (; keyPos <= key.length; ++keyPos)
        {
            byte keyLabel = (keyPos < key.length) ? key[keyPos] : 0;
            int childId = appendNode();

            DawgNode node = _nodes.get(id);
            DawgNode child = _nodes.get(childId);

            if (node.child == 0)
            {
                child.isState = true;
            }
            // New child becomes the head of the sibling chain.
            child.sibling = node.child;
            child.label = keyLabel;
            node.child = childId;
            _nodeStack.add(childId);

            id = childId;
        }
        _nodes.get(id).setValue(value);
    }

    /**
     * Clears all internal state so the builder can be reused.
     */
    void clear()
    {
        _nodes.clear();
        _units.clear();
        _labels.clear();
        _isIntersections.clear();
        _table.clear();
        _nodeStack.clear();
        _recycleBin.clear();
        _numStates = 0;
    }

    /**
     * Trie node used during construction; packed into an int unit on flush.
     */
    static class DawgNode
    {
        // For terminal nodes `child` holds the value instead of a child id
        // (see getValue/setValue).
        int child;
        int sibling;
        byte label;
        boolean isState;
        boolean hasSibling;

        void reset()
        {
            child = 0;
            sibling = 0;
            label = (byte) 0;
            isState = false;
            hasSibling = false;
        }

        int getValue()
        {
            return child;
        }

        void setValue(int value)
        {
            child = value;
        }

        /**
         * Packs this node into an int unit: leaves (label == 0) store
         * (value &lt;&lt; 1) | hasSibling; inner nodes store
         * (child &lt;&lt; 2) | isState-bit(2) | hasSibling-bit(1).
         */
        int unit()
        {
            if (label == 0)
            {
                return (child << 1) | (hasSibling ? 1 : 0);
            }
            return (child << 2) | (isState ? 2 : 0) | (hasSibling ? 1 : 0);
        }
    }

    /**
     * Pops finished nodes off the stack down to (but excluding) {@code id}.
     * Each popped sibling chain is either merged with an identical existing
     * unit chain (hash-table hit — the DAWG sharing step) or materialized
     * as new units; the flushed nodes are recycled.
     */
    private void flush(int id)
    {
        while (_nodeStack.get(_nodeStack.size() - 1) != id)
        {
            int nodeId = _nodeStack.get(_nodeStack.size() - 1);
            _nodeStack.deleteLast();

            // Keep the open-addressing table below ~75% load.
            if (_numStates >= _table.size() - (_table.size() >>> 2))
            {
                expandTable();
            }

            int numSiblings = 0;
            for (int i = nodeId; i != 0; i = _nodes.get(i).sibling)
            {
                ++numSiblings;
            }

            // length-2 array emulating pass-by-reference: [matchId, hashId]
            // (the original comment said "length 1", which was wrong)
            int[] matchHashId = findNode(nodeId);
            int matchId = matchHashId[0];
            int hashId = matchHashId[1];

            if (matchId != 0)
            {
                // Identical chain already exists: share it and mark the
                // shared head as an intersection.
                _isIntersections.set(matchId, true);
            }
            else
            {
                // No match: append fresh units, filled back-to-front so the
                // chain occupies consecutive ids in sibling order.
                int unitId = 0;
                for (int i = 0; i < numSiblings; ++i)
                {
                    unitId = appendUnit();
                }
                for (int i = nodeId; i != 0; i = _nodes.get(i).sibling)
                {
                    _units.set(unitId, _nodes.get(i).unit());
                    _labels.set(unitId, _nodes.get(i).label);
                    --unitId;
                }
                matchId = unitId + 1;
                _table.set(hashId, matchId);
                ++_numStates;
            }

            // Recycle the flushed nodes.
            for (int i = nodeId, next; i != 0; i = next)
            {
                next = _nodes.get(i).sibling;
                freeNode(i);
            }

            // Re-point the parent (now top of stack) at the merged chain.
            _nodes.get(_nodeStack.get(_nodeStack.size() - 1)).child = matchId;
        }
        _nodeStack.deleteLast();
    }

    /**
     * Doubles the hash table and re-inserts every chain-head unit
     * (leaves and state units).
     */
    private void expandTable()
    {
        int tableSize = _table.size() << 1;
        _table.clear();
        _table.resize(tableSize, 0);

        for (int id = 1; id < _units.size(); ++id)
        {
            // if (_labels.get(i) == 0 || _units.get(id).isState)) {
            if (_labels.get(id) == 0 || (_units.get(id) & 2) == 2)
            {
                int[] ret = findUnit(id);
                int hashId = ret[1];
                _table.set(hashId, id);
            }
        }
    }

    /**
     * Linear-probing lookup for a unit chain; only used while rebuilding the
     * table, so it returns the first empty slot ([1] = hashId; [0] unused).
     */
    private int[] findUnit(int id)
    {
        int[] ret = new int[2];
        int hashId = hashUnit(id) % _table.size();
        for (; ; hashId = (hashId + 1) % _table.size())
        {
            // Remainder adjustment: Java's % can yield a negative result.
            if (hashId < 0)
            {
                hashId += _table.size();
            }
            int unitId = _table.get(hashId);
            if (unitId == 0)
            {
                break;
            }

            // there must not be the same unit.
        }
        ret[1] = hashId;
        return ret;
    }

    /**
     * Linear-probing lookup for a node chain: returns [matchId, hashId],
     * where matchId is an existing identical unit chain or 0 for no match.
     */
    private int[] findNode(int nodeId)
    {
        int[] ret = new int[2];
        int hashId = hashNode(nodeId) % _table.size();
        for (; ; hashId = (hashId + 1) % _table.size())
        {
            // Remainder adjustment: Java's % can yield a negative result.
            if (hashId < 0)
            {
                hashId += _table.size();
            }
            int unitId = _table.get(hashId);
            if (unitId == 0)
            {
                break;
            }

            if (areEqual(nodeId, unitId))
            {
                ret[0] = unitId;
                ret[1] = hashId;
                return ret;
            }
        }
        ret[1] = hashId;
        return ret;
    }

    /**
     * Structural comparison of a node sibling chain against a unit chain:
     * same length, same packed units, same labels.
     */
    private boolean areEqual(int nodeId, int unitId)
    {
        // First advance unitId to the end of the unit chain while checking
        // that both chains have the same number of siblings.
        for (int i = _nodes.get(nodeId).sibling; i != 0;
             i = _nodes.get(i).sibling)
        {
            // if (_units.get(unitId).hasSibling() == false) {
            if ((_units.get(unitId) & 1) != 1)
            {
                return false;
            }
            ++unitId;
        }
        // if (_units.get(unitId).hasSibling() == true) {
        if ((_units.get(unitId) & 1) == 1)
        {
            return false;
        }

        // Units were written back-to-front, so walk the unit chain backwards
        // while walking the node chain forwards.
        for (int i = nodeId; i != 0; i = _nodes.get(i).sibling, --unitId)
        {
            // if (_nodes.get(i) != _units.get(unitId).unit() ||
            if (_nodes.get(i).unit() != _units.get(unitId) ||
                    _nodes.get(i).label != _labels.get(unitId))
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Hash of a unit chain starting at {@code id}; siblings occupy
     * consecutive ids, so the loop increments until the hasSibling bit ends.
     */
    private int hashUnit(int id)
    {
        int hashValue = 0;
        for (; id != 0; ++id)
        {
            // int unit = _units.get(id).unit();
            int unit = _units.get(id);
            byte label = _labels.get(id);
            hashValue ^= hash(((label & 0xFF) << 24) ^ unit);

            // if (_units.get(id).hasSibling() == false) {
            if ((_units.get(id) & 1) != 1)
            {
                break;
            }
        }
        return hashValue;
    }

    /**
     * Hash of a node sibling chain; must mix the same material as
     * {@link #hashUnit(int)} so equal chains hash equally.
     */
    private int hashNode(int id)
    {
        int hashValue = 0;
        for (; id != 0; id = _nodes.get(id).sibling)
        {
            int unit = _nodes.get(id).unit();
            byte label = _nodes.get(id).label;
            hashValue ^= hash(((label & 0xFF) << 24) ^ unit);
        }
        return hashValue;
    }

    /**
     * Appends an empty unit (with its label and intersection bit) and
     * returns its id.
     */
    private int appendUnit()
    {
        _isIntersections.append();
        _units.add(0);
        _labels.add((byte) 0);

        return _isIntersections.size() - 1;
    }

    /**
     * Allocates a node, reusing one from the recycle bin when possible.
     */
    private int appendNode()
    {
        int id;
        if (_recycleBin.empty())
        {
            id = _nodes.size();
            _nodes.add(new DawgNode());
        }
        else
        {
            id = _recycleBin.get(_recycleBin.size() - 1);
            _nodes.get(id).reset();
            _recycleBin.deleteLast();
        }
        return id;
    }

    /**
     * Returns a node to the recycle bin for later reuse.
     */
    private void freeNode(int id)
    {
        _recycleBin.add(id);
    }

    /**
     * Integer mixing function (Thomas Wang style) used for chain hashing.
     */
    private static int hash(int key)
    {
        key = ~key + (key << 15); // key = (key << 15) - key - 1;
        key = key ^ (key >>> 12);
        key = key + (key << 2);
        key = key ^ (key >>> 4);
        key = key * 2057; // key = (key + (key << 3)) + (key << 11);
        key = key ^ (key >>> 16);
        return key;
    }

    /** Initial hash-table capacity (power of two). */
    private static final int INITIAL_TABLE_SIZE = 1 << 10;
    /** Working trie nodes (recycled via _recycleBin). */
    private ArrayList<DawgNode> _nodes = new ArrayList<DawgNode>();
    /** Finished, packed units. */
    private AutoIntPool _units = new AutoIntPool();
    /** Label byte per unit. */
    private AutoBytePool _labels = new AutoBytePool();
    /** Bit per unit: set when several paths share the unit. */
    private BitVector _isIntersections = new BitVector();
    /** Open-addressing hash table: slot -> unit-chain head id. */
    private AutoIntPool _table = new AutoIntPool();
    /** Stack of node ids whose subtrees are still being built. */
    private AutoIntPool _nodeStack = new AutoIntPool();
    /** Freed node ids available for reuse. */
    private AutoIntPool _recycleBin = new AutoIntPool();
    /** Number of distinct states stored in the table. */
    private int _numStates;
}

+ 525
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/DoubleArrayBuilder.java View File

@@ -0,0 +1,525 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

/**
 * Double-array builder: converts a {@link Keyset} into the packed int[]
 * double-array units. With values the keys are first merged into a DAWG so
 * equal suffixes share units; without values the array is built directly
 * from the sorted keys. Unit packing (as evidenced by the bit operations
 * below): low 8 bits = label, bit 8 = has-leaf, offset stored at bit 10
 * (or at bit 2 with extension bit 9 when it does not fit in 21 bits),
 * bit 31 marks a leaf unit holding a value.
 *
 * @author
 */
public class DoubleArrayBuilder
{
    /**
     * Builds the double array from the key set.
     * @param keyset keys (sorted ascending) and optional values
     */
    public void build(Keyset keyset)
    {
        if (keyset.hasValues())
        {
            DawgBuilder dawgBuilder = new DawgBuilder();
            buildDawg(keyset, dawgBuilder);
            buildFromDawg(dawgBuilder);
            dawgBuilder.clear();
        }
        else
        {
            buildFromKeyset(keyset);
        }
    }

    /**
     * @return a trimmed copy of the built unit array
     */
    public int[] copy()
    {
        int[] ret = new int[_units.size()];
        System.arraycopy(_units.getBuffer(), 0, ret, 0, _units.size());
        return ret;
    }

    /**
     * Releases all builder state.
     */
    void clear()
    {
        _units = null;
        _extras = null;
        _labels.clear();
        _table = null;
        _extrasHead = 0;
    }

    /** Units per allocation block. */
    private static final int BLOCK_SIZE = 256;
    /** Number of trailing blocks kept "open" with free-list bookkeeping. */
    private static final int NUM_EXTRA_BLOCKS = 16;
    /** Size of the circular extras pool (covers the open blocks). */
    private static final int NUM_EXTRAS = BLOCK_SIZE * NUM_EXTRA_BLOCKS;

    /** Offset bits above 21 that must be clear for the short packed form. */
    private static final int UPPER_MASK = 0xFF << 21;
    /** Low 8 bits of a relative offset. */
    private static final int LOWER_MASK = 0xFF;

    /** Bits preserved when a unit's offset is rewritten: leaf flag (31),
     *  has-leaf flag (8) and the label byte. */
    private static final int OFFSET_MASK = (1 << 31) | (1 << 8) | 0xFF;

    /**
     * Free-list bookkeeping for one unit inside the open blocks.
     */
    static class DoubleArrayBuilderExtraUnit
    {
        /** Previous unfixed unit in the circular free list. */
        int prev;
        /** Next unfixed unit in the circular free list. */
        int next;
        /** Whether the unit id has been claimed. */
        boolean isFixed;
        /** Whether the unit is already used as an offset base. */
        boolean isUsed;
    }

    private int numBlocks()
    {
        return _units.size() / BLOCK_SIZE;
    }

    /**
     * Extras record for a unit id; the pool is circular, so ids map modulo
     * NUM_EXTRAS (only the trailing open blocks have live records).
     */
    private DoubleArrayBuilderExtraUnit extras(int id)
    {
        return _extras[id % NUM_EXTRAS];
    }

    /**
     * Feeds every key/value of the key set into the DAWG builder.
     * @param keyset      source keys
     * @param dawgBuilder target builder
     */
    private void buildDawg(Keyset keyset, DawgBuilder dawgBuilder)
    {
        dawgBuilder.init();
        for (int i = 0; i < keyset.numKeys(); ++i)
        {
            dawgBuilder.insert(keyset.getKey(i), keyset.getValue(i));
        }
        dawgBuilder.finish();
    }

    /**
     * Entry point for the DAWG-based build: size the unit array, set up the
     * free-list extras, initialize the root unit and recurse over the DAWG.
     */
    private void buildFromDawg(DawgBuilder dawg)
    {
        // Reserve a power-of-two number of units at least as large as the DAWG.
        int numUnits = 1;
        while (numUnits < dawg.size())
        {
            numUnits <<= 1;
        }
        _units.reserve(numUnits);

        // One offset slot per intersection, so shared subtrees are written once.
        _table = new int[dawg.numIntersections()];
        _extras = new DoubleArrayBuilderExtraUnit[NUM_EXTRAS];
        for (int i = 0; i < _extras.length; ++i)
        {
            _extras[i] = new DoubleArrayBuilderExtraUnit();
        }

        reserveId(0);


        int[] units = _units.getBuffer();
        // _units[0].set_offset(1);
        units[0] |= 1 << 10;
        // _units[0].set_label(0);
        units[0] &= ~0xFF;

        if (dawg.child(dawg.root()) != 0)
        {
            buildFromDawg(dawg, dawg.root(), 0);
        }

        fixAllBlocks();

        _extras = null;
        _labels.clear();
        _table = null;
    }

    /**
     * Recursively lays out the children of one DAWG node into the double
     * array. Intersection children that were already placed are reused by
     * pointing this unit's offset at the recorded position.
     */
    private void buildFromDawg(DawgBuilder dawg, int dawgId, int dictId)
    {
        int dawgChildId = dawg.child(dawgId);
        if (dawg.isIntersection(dawgChildId))
        {
            int intersectionId = dawg.intersectionId(dawgChildId);
            int offset = _table[intersectionId];
            int[] units = _units.getBuffer();
            if (offset != 0)
            {
                offset ^= dictId;
                // The relative offset must fit either entirely below bit 21
                // or entirely above the label byte to be representable.
                if ((offset & UPPER_MASK) == 0 || (offset & LOWER_MASK) == 0)
                {
                    if (dawg.isLeaf(dawgChildId))
                    {
                        // units[dictId].setHasLeaf(true);
                        units[dictId] |= 1 << 8;
                    }
                    // units[dictId].setOffset(offset);
                    units[dictId] &= OFFSET_MASK;
                    units[dictId] |=
                            (offset < 1 << 21)
                                    ? offset << 10
                                    : (offset << 2) | (1 << 9);
                    return;
                }
            }
        }

        int offset = arrangeFromDawg(dawg, dawgId, dictId);
        if (dawg.isIntersection(dawgChildId))
        {
            // Remember where this shared subtree was placed for future reuse.
            _table[dawg.intersectionId(dawgChildId)] = offset;
        }

        do
        {
            byte childLabel = dawg.label(dawgChildId);
            int dictChildId = offset ^ (childLabel & 0xFF);
            if (childLabel != 0)
            {
                buildFromDawg(dawg, dawgChildId, dictChildId);
            }
            dawgChildId = dawg.sibling(dawgChildId);
        }
        while (dawgChildId != 0);
    }

    /**
     * Places the children of one DAWG node: collects their labels, finds an
     * offset where all child slots are free, and writes the child units.
     *
     * @return the chosen offset
     */
    private int arrangeFromDawg(DawgBuilder dawg, int dawgId, int dictId)
    {
        _labels.resize(0);

        int dawgChildId = dawg.child(dawgId);
        while (dawgChildId != 0)
        {
            _labels.add(dawg.label(dawgChildId));
            dawgChildId = dawg.sibling(dawgChildId);
        }

        int offset = findValidOffset(dictId);
        int[] units = _units.getBuffer();
        // units[dictId].setOffset(dictId ^ offset);
        units[dictId] &= OFFSET_MASK;
        int newId = dictId ^ offset;
        units[dictId] |=
                (newId < 1 << 21)
                        ? newId << 10
                        : (newId << 2) | (1 << 9);

        dawgChildId = dawg.child(dawgId);
        for (int i = 0; i < _labels.size(); ++i)
        {
            int dictChildId = offset ^ (_labels.get(i) & 0xFF);
            reserveId(dictChildId);
            // reserveId may have grown the pool; re-fetch the buffer.
            units = _units.getBuffer();

            if (dawg.isLeaf(dawgChildId))
            {
                // units[dictId].setHasLeaf(true);
                units[dictId] |= 1 << 8;
                // units[dictChildId].setValue(dawg.value(dawgChildId));
                units[dictChildId] = dawg.value(dawgChildId) | (1 << 31);
            }
            else
            {
                // units[dictChildId].setLabel(_labels[i]);
                units[dictChildId] = (units[dictChildId] & ~0xFF)
                        | (_labels.get(i) & 0xFF);
            }

            dawgChildId = dawg.sibling(dawgChildId);
        }
        extras(offset).isUsed = true;

        return offset;
    }

    /**
     * Entry point for the direct (no values) build: size the unit array,
     * set up the extras free list, initialize the root and recurse over the
     * sorted key range.
     */
    private void buildFromKeyset(Keyset keyset)
    {
        int numUnits = 1;
        while (numUnits < keyset.numKeys())
        {
            numUnits <<= 1;
        }
        _units.reserve(numUnits);

        _extras = new DoubleArrayBuilderExtraUnit[NUM_EXTRAS];
        for (int i = 0; i < _extras.length; ++i)
        {
            _extras[i] = new DoubleArrayBuilderExtraUnit();
        }

        reserveId(0);
        extras(0).isUsed = true;

        int[] units = _units.getBuffer();
        // units[0].setOffset(1);
        units[0] |= 1 << 10;
        // units[0].setLabel(0);
        units[0] &= ~0xFF;

        if (keyset.numKeys() > 0)
        {
            buildFromKeyset(keyset, 0, keyset.numKeys(), 0, 0);
        }

        fixAllBlocks();

        _extras = null;
        _labels.clear();
    }

    /**
     * Recursively lays out keys[begin, end) at byte position {@code depth}:
     * arranges the distinct labels at this depth, then recurses into each
     * run of keys sharing the same label.
     */
    private void buildFromKeyset(Keyset keyset, int begin, int end, int depth,
                                 int dicId)
    {
        int offset = arrangeFromKeyset(keyset, begin, end, depth, dicId);

        // Skip keys that terminate at this depth (label 0).
        while (begin < end)
        {
            if (keyset.getKeyByte(begin, depth) != 0)
            {
                break;
            }
            ++begin;
        }
        if (begin == end)
        {
            return;
        }

        int lastBegin = begin;
        byte lastLabel = keyset.getKeyByte(begin, depth);
        while (++begin < end)
        {
            byte label = keyset.getKeyByte(begin, depth);
            if (label != lastLabel)
            {
                buildFromKeyset(keyset, lastBegin, begin, depth + 1,
                        offset ^ (lastLabel & 0xFF));
                lastBegin = begin;
                lastLabel = keyset.getKeyByte(begin, depth);
            }
        }
        buildFromKeyset(keyset, lastBegin, end, depth + 1, offset ^ (lastLabel & 0xFF));
    }

    /**
     * Places the distinct labels of keys[begin, end) at byte {@code depth}:
     * validates ordering and 0 bytes, finds a free offset and writes the
     * child units (terminating keys become value leaves).
     *
     * @return the chosen offset
     */
    private int arrangeFromKeyset(Keyset keyset, int begin, int end, int depth,
                                  int dictId)
    {
        _labels.resize(0);

        int value = -1;
        for (int i = begin; i < end; ++i)
        {
            byte label = keyset.getKeyByte(i, depth);
            if (label == 0)
            {
                if (depth < keyset.getKey(i).length)
                {
                    throw new IllegalArgumentException(
                            "failed to build double-array: " +
                                    "invalid null character");
                }
                else if (keyset.getValue(i) < 0)
                {
                    throw new IllegalArgumentException(
                            "failed to build double-array: negative value");
                }

                // First terminating key wins; duplicates are ignored.
                if (value == -1)
                {
                    value = keyset.getValue(i);
                }
            }

            if (_labels.empty())
            {
                _labels.add(label);
            }
            else if (label != _labels.get(_labels.size() - 1))
            {
                if ((label & 0xFF) < (_labels.get(_labels.size() - 1) & 0xFF))
                {
                    throw new IllegalArgumentException(
                            "failed to build double-array: wrong key order");
                }
                _labels.add(label);
            }
        }

        int offset = findValidOffset(dictId);
        int[] units = _units.getBuffer();
        // units[dictId].setOffset(dictId ^ offset);
        units[dictId] &= OFFSET_MASK;
        int newId = dictId ^ offset;
        units[dictId] |=
                (newId < 1 << 21)
                        ? newId << 10
                        : (newId << 2) | (1 << 9);

        for (int i = 0; i < _labels.size(); ++i)
        {
            int dictChildId = offset ^ (_labels.get(i) & 0xFF);
            reserveId(dictChildId);
            // reserveId may have grown the pool; re-fetch the buffer.
            units = _units.getBuffer();
            if (_labels.get(i) == 0)
            {
                // units[dictId].setHasLeaf(true);
                units[dictId] |= 1 << 8;
                // units[dictChildId].setValue(value);
                units[dictChildId] = value | (1 << 31);
            }
            else
            {
                // units[dictChildId].setLabel(_labels[i]);
                units[dictChildId] = (units[dictChildId] & ~0xFF)
                        | (_labels.get(i) & 0xFF);
            }
        }
        extras(offset).isUsed = true;

        return offset;
    }

    /**
     * Finds an offset such that every slot offset^label for the collected
     * labels is free, scanning the circular free list; falls back to the
     * end of the array when nothing fits.
     */
    int findValidOffset(int id)
    {
        if (_extrasHead >= _units.size())
        {
            return _units.size() | (id & LOWER_MASK);
        }

        int unfixedId = _extrasHead;
        do
        {
            int offset = unfixedId ^ (_labels.get(0) & 0xFF);
            if (isValidOffset(id, offset))
            {
                return offset;
            }
            unfixedId = extras(unfixedId).next;
        }
        while (unfixedId != _extrasHead);
        return _units.size() | (id & LOWER_MASK);
    }

    /**
     * Whether {@code offset} is usable from unit {@code id}: not already an
     * offset base, relative offset representable, and all label slots free.
     */
    boolean isValidOffset(int id, int offset)
    {
        if (extras(offset).isUsed)
        {
            return false;
        }

        int relOffset = id ^ offset;
        if ((relOffset & LOWER_MASK) != 0 && (relOffset & UPPER_MASK) != 0)
        {
            return false;
        }

        for (int i = 1; i < _labels.size(); ++i)
        {
            if (extras(offset ^ (_labels.get(i) & 0xFF)).isFixed)
            {
                return false;
            }
        }

        return true;
    }

    /**
     * Claims a unit id: grows the array when needed and unlinks the id from
     * the circular free list.
     */
    void reserveId(int id)
    {
        if (id >= _units.size())
        {
            expandUnits();
        }

        if (id == _extrasHead)
        {
            _extrasHead = extras(id).next;
            if (_extrasHead == id)
            {
                // Free list became empty; park the head past the end.
                _extrasHead = _units.size();
            }
        }
        // Unlink id from the doubly-linked circular free list.
        extras(extras(id).prev).next = extras(id).next;
        extras(extras(id).next).prev = extras(id).prev;
        extras(id).isFixed = true;
    }

    /**
     * Appends one block of units. When the window of open blocks slides,
     * the block falling out of it is fixed first, and the recycled extras
     * records are reset before relinking the new block into the free list.
     */
    void expandUnits()
    {
        int srcNumUnits = _units.size();
        int srcNumBlocks = numBlocks();

        int destNumUnits = srcNumUnits + BLOCK_SIZE;
        int destNumBlocks = srcNumBlocks + 1;

        if (destNumBlocks > NUM_EXTRA_BLOCKS)
        {
            fixBlock(srcNumBlocks - NUM_EXTRA_BLOCKS);
        }

        _units.resize(destNumUnits);

        if (destNumBlocks > NUM_EXTRA_BLOCKS)
        {
            // The extras records being reused belonged to the fixed block.
            for (int id = srcNumUnits; id < destNumUnits; ++id)
            {
                extras(id).isUsed = false;
                extras(id).isFixed = false;
            }
        }

        // Chain the new block's units together...
        for (int i = srcNumUnits + 1; i < destNumUnits; ++i)
        {
            extras(i - 1).next = i;
            extras(i).prev = i - 1;
        }

        extras(srcNumUnits).prev = destNumUnits - 1;
        extras(destNumUnits - 1).next = srcNumUnits;

        // ...then splice the chain into the existing circular free list.
        extras(srcNumUnits).prev = extras(_extrasHead).prev;
        extras(destNumUnits - 1).next = _extrasHead;

        extras(extras(_extrasHead).prev).next = srcNumUnits;
        extras(_extrasHead).prev = destNumUnits - 1;
    }

    /**
     * Fixes every block still inside the open window at the end of a build.
     */
    void fixAllBlocks()
    {
        int begin = 0;
        if (numBlocks() > NUM_EXTRA_BLOCKS)
        {
            begin = numBlocks() - NUM_EXTRA_BLOCKS;
        }
        int end = numBlocks();

        for (int blockId = begin; blockId != end; ++blockId)
        {
            fixBlock(blockId);
        }
    }

    /**
     * Finalizes one block: every still-unfixed unit is claimed and given a
     * label that points it at an unused offset within the block.
     */
    void fixBlock(int blockId)
    {
        int begin = blockId * BLOCK_SIZE;
        int end = begin + BLOCK_SIZE;

        int unusedOffset = 0;
        for (int offset = begin; offset != end; ++offset)
        {
            if (!extras(offset).isUsed)
            {
                unusedOffset = offset;
                break;
            }
        }

        for (int id = begin; id != end; ++id)
        {
            if (!extras(id).isFixed)
            {
                reserveId(id);
                int[] units = _units.getBuffer();
                // units[id].setLabel(id ^ unused_offset);
                units[id] = (units[id] & ~0xFF)
                        | ((id ^ unusedOffset) & 0xFF);
            }
        }
    }

    /** The double-array units being built. */
    private AutoIntPool _units = new AutoIntPool();
    /** Circular pool of free-list records for the open blocks. */
    private DoubleArrayBuilderExtraUnit[] _extras;
    /** Scratch list of child labels for the node being arranged. */
    private AutoBytePool _labels = new AutoBytePool();
    /** intersectionId -> placed offset, for DAWG subtree reuse. */
    private int[] _table;
    /** Head of the circular list of unfixed units. */
    private int _extrasHead;
}

+ 89
- 0
core/src/main/java/com/hankcs/hanlp/collection/dartsclone/details/Keyset.java View File

@@ -0,0 +1,89 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package com.hankcs.hanlp.collection.dartsclone.details;

/**
 * A key set: an ordered collection of byte-array keys, each optionally
 * paired with an int value.
 *
 * @author manabe
 */
public class Keyset
{
    /** The keys, each encoded as a byte sequence. */
    private final byte[][] _keys;
    /** Per-key values, or null when the key set carries no values. */
    private final int[] _values;

    /**
     * Creates a key set.
     *
     * @param keys   the keys as byte arrays
     * @param values the value of each key, or null for none
     */
    public Keyset(byte[][] keys, int[] values)
    {
        _keys = keys;
        _values = values;
    }

    /**
     * @return number of keys in the set
     */
    int numKeys()
    {
        return _keys.length;
    }

    /**
     * @param id key index
     * @return the key at the given index
     */
    byte[] getKey(int id)
    {
        return _keys[id];
    }

    /**
     * Fetches one byte of one key; positions past the end read as 0.
     *
     * @param keyId  index of the key
     * @param byteId position within the key
     * @return the byte, or 0 when byteId is out of range
     */
    byte getKeyByte(int keyId, int byteId)
    {
        byte[] key = _keys[keyId];
        return byteId < key.length ? key[byteId] : (byte) 0;
    }

    /**
     * @return whether the key set carries explicit values
     */
    boolean hasValues()
    {
        return _values != null;
    }

    /**
     * Value lookup; without explicit values a key's index is its value.
     *
     * @param id key index
     * @return the associated value
     */
    int getValue(int id)
    {
        return _values != null ? _values[id] : id;
    }
}

+ 152
- 0
core/src/main/java/com/hankcs/hanlp/collection/sequence/SString.java View File

@@ -0,0 +1,152 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/12/5 19:35</create-date>
*
* <copyright file="CharArray.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.sequence;

import java.util.Arrays;

/**
* (SimpleString)字符串,因为String内部的char[]无法访问,而许多任务经常操作char[],所以封装了这个结构。
*
* @author hankcs
*/
public class SString implements Comparable<SString>, CharSequence
{
public char[] value;
/**
* 开始位置,包含
*/
public int b;
/**
* 结束位置,不包含
*/
public int e;

/**
* 建立一个字符串
*
* @param value
* @param b
* @param e
*/
public SString(char[] value, int b, int e)
{
this.value = value;
this.b = b;
this.e = e;
}

public SString(String s)
{
value = s.toCharArray();
b = 0;
e = s.length();
}

@Override
public boolean equals(Object anObject)
{
if (this == anObject)
{
return true;
}
if (anObject instanceof SString)
{
SString anotherString = (SString) anObject;
int n = value.length;
if (n == anotherString.value.length)
{
char v1[] = value;
char v2[] = anotherString.value;
int i = 0;
while (n-- != 0)
{
if (v1[i] != v2[i])
return false;
i++;
}
return true;
}
}
return false;
}

@Override
public int length()
{
return e - b;
}

@Override
public char charAt(int index)
{
return value[b + index];
}

@Override
public CharSequence subSequence(int start, int end)
{
return new SString(value, b + start, b + end);
}

@Override
public String toString()
{
return new String(value, b, e - b);
}

@Override
public int compareTo(SString anotherString)
{
int len1 = value.length;
int len2 = anotherString.value.length;
int lim = Math.min(len1, len2);
char v1[] = value;
char v2[] = anotherString.value;

int k = 0;
while (k < lim)
{
char c1 = v1[k];
char c2 = v2[k];
if (c1 != c2)
{
return c1 - c2;
}
k++;
}
return len1 - len2;
}

public char[] toCharArray()
{
return Arrays.copyOfRange(value, b, e);
}

public static SString valueOf(char word)
{
SString s = new SString(new char[]{word}, 0, 1);

return s;
}

public SString add(SString other)
{
char[] value = new char[length() + other.length()];
System.arraycopy(this.value, b, value, 0, length());
System.arraycopy(other.value, other.b, value, length(), other.length());
b = 0;
e = length() + other.length();
this.value = value;

return this;
}
}

+ 29
- 0
core/src/main/java/com/hankcs/hanlp/collection/set/UnEmptyStringSet.java View File

@@ -0,0 +1,29 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/2 12:08</create-date>
*
* <copyright file="UnEmptyStringSet.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.set;

import java.util.TreeSet;

/**
 * A String set (TreeSet-backed) that silently rejects blank strings.
 * @author hankcs
 */
public class UnEmptyStringSet extends TreeSet<String>
{
    /**
     * Adds the string unless it is empty or whitespace-only.
     *
     * @param s candidate string
     * @return true iff the set changed as a result
     */
    @Override
    public boolean add(String s)
    {
        if (s.trim().isEmpty())
        {
            return false;
        }
        return super.add(s);
    }
}

+ 1475
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/DoubleArrayTrie.java
File diff suppressed because it is too large
View File


+ 33
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/ITrie.java View File

@@ -0,0 +1,33 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2015/4/23 0:23</create-date>
*
* <copyright file="ITrie.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2015, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie;

import com.hankcs.hanlp.corpus.io.ByteArray;

import java.io.DataOutputStream;
import java.util.TreeMap;

/**
 * Trie tree interface: the common contract shared by the trie
 * implementations (build from a sorted map, binary save/load, lookup).
 * @author hankcs
 */
public interface ITrie<V>
{
    /**
     * Builds the trie from a sorted key-to-value map.
     * @param keyValueMap entries, sorted by key
     * @return an implementation-defined status code
     */
    int build(TreeMap<String, V> keyValueMap);

    /**
     * Serializes the trie structure to the stream.
     * @return true on success
     */
    boolean save(DataOutputStream out);

    /**
     * Restores the trie from a binary blob.
     * @param byteArray the serialized structure
     * @param value     the values, in the order they were saved
     *                  (presumably matching save() order — confirm per impl)
     * @return true on success
     */
    boolean load(ByteArray byteArray, V[] value);

    /**
     * Looks up the value of a key given as char[].
     */
    V get(char[] key);

    /**
     * Looks up the value of a key given as String.
     */
    V get(String key);

    /**
     * Copies all values into the given array (or one of its runtime type).
     */
    V[] getValueArray(V[] a);

    /**
     * @return whether the key exists in the trie
     */
    boolean containsKey(String key);

    /**
     * @return number of keys stored
     */
    int size();
}

+ 304
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BaseNode.java View File

@@ -0,0 +1,304 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/5/2 20:22</create-date>
*
* <copyright file="INode.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie;

import com.hankcs.hanlp.corpus.io.ByteArray;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.util.AbstractMap;
import java.util.Map;
import java.util.Set;

/**
 * Node: base class unifying the trie root and all interior/leaf nodes.
 *
 * @param <V> value type stored at word-terminating nodes
 * @author He Han
 */
public abstract class BaseNode<V> implements Comparable<BaseNode>
{
    /**
     * Cached Status.values() for fast ordinal-to-enum lookup while loading.
     */
    static final Status[] ARRAY_STATUS = Status.values();
    /**
     * Child nodes.
     */
    protected BaseNode[] child;
    /**
     * Status of this node (word end / word middle / not a word / deleted).
     */
    protected Status status;
    /**
     * The character this node represents.
     */
    protected char c;
    /**
     * The value attached to this node (meaningful when the node ends a word).
     */
    protected V value;

    /**
     * Walks from this node along path[begin..]; returns the node reached,
     * or null when the path leaves the trie or hits a deleted node.
     */
    public BaseNode<V> transition(String path, int begin)
    {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length(); ++i)
        {
            cur = cur.getChild(path.charAt(i));
            if (cur == null || cur.status == Status.UNDEFINED_0) return null;
        }
        return cur;
    }

    /**
     * Walks from this node along path[begin..]; returns the node reached,
     * or null when the path leaves the trie or hits a deleted node.
     */
    public BaseNode<V> transition(char[] path, int begin)
    {
        BaseNode<V> cur = this;
        for (int i = begin; i < path.length; ++i)
        {
            cur = cur.getChild(path[i]);
            if (cur == null || cur.status == Status.UNDEFINED_0) return null;
        }
        return cur;
    }

    /**
     * Single-character transition.
     * @param path the character
     * @return the child node, or null when absent or deleted
     */
    public BaseNode<V> transition(char path)
    {
        BaseNode<V> cur = this;
        cur = cur.getChild(path);
        if (cur == null || cur.status == Status.UNDEFINED_0) return null;
        return cur;
    }

    /**
     * Adds (or merges) a child node.
     *
     * @return true when a node was newly added, false when an existing
     *         node was modified
     */
    protected abstract boolean addChild(BaseNode node);

    /**
     * Whether a child for the given character exists.
     *
     * @param c the child's char
     * @return true when present
     */
    protected boolean hasChild(char c)
    {
        return getChild(c) != null;
    }

    protected char getChar()
    {
        return c;
    }

    /**
     * Fetches the child node for a character.
     *
     * @param c the child's char
     * @return the child node, or null
     */
    public abstract BaseNode getChild(char c);

    /**
     * @return the value stored at this node
     */
    public final V getValue()
    {
        return value;
    }

    /**
     * Sets the value stored at this node.
     *
     * @param value the value
     */
    public final void setValue(V value)
    {
        this.value = value;
    }

    @Override
    public int compareTo(BaseNode other)
    {
        return compareTo(other.getChar());
    }

    /**
     * Overload: comparison against a bare character.
     * @param other the character to compare with
     * @return sign of (this.c - other)
     */
    public int compareTo(char other)
    {
        if (this.c > other)
        {
            return 1;
        }
        if (this.c < other)
        {
            return -1;
        }
        return 0;
    }

    /**
     * @return this node's word status
     */
    public Status getStatus()
    {
        return status;
    }

    /**
     * Depth-first traversal collecting every (word, value) pair below this
     * node; sb accumulates the path prefix.
     */
    protected void walk(StringBuilder sb, Set<Map.Entry<String, V>> entrySet)
    {
        sb.append(c);
        if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3)
        {
            entrySet.add(new TrieEntry(sb.toString(), value));
        }
        if (child == null) return;
        for (BaseNode node : child)
        {
            if (node == null) continue;
            // Each child gets its own copy of the prefix builder.
            node.walk(new StringBuilder(sb.toString()), entrySet);
        }
    }

    /**
     * Serializes the subtree (char, status ordinal, child count, children)
     * in pre-order; values are NOT written by this variant.
     */
    protected void walkToSave(DataOutputStream out) throws IOException
    {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        int childSize = 0;
        if (child != null) childSize = child.length;
        out.writeInt(childSize);
        if (child == null) return;
        for (BaseNode node : child)
        {
            node.walkToSave(out);
        }
    }

    /**
     * Serializes the subtree in pre-order, including the value of every
     * word-terminating node (via Java object serialization).
     */
    protected void walkToSave(ObjectOutput out) throws IOException
    {
        out.writeChar(c);
        out.writeInt(status.ordinal());
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
        {
            out.writeObject(value);
        }
        int childSize = 0;
        if (child != null) childSize = child.length;
        out.writeInt(childSize);
        if (child == null) return;
        for (BaseNode node : child)
        {
            node.walkToSave(out);
        }
    }

    /**
     * Rebuilds the subtree from a ByteArray written by
     * walkToSave(DataOutputStream); values for word nodes are pulled from
     * the external valueArray in traversal order.
     */
    protected void walkToLoad(ByteArray byteArray, _ValueArray<V> valueArray)
    {
        c = byteArray.nextChar();
        status = ARRAY_STATUS[byteArray.nextInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
        {
            value = valueArray.nextValue();
        }
        int childSize = byteArray.nextInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i)
        {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray, valueArray);
        }
    }

    /**
     * Rebuilds the subtree from an ObjectInput written by
     * walkToSave(ObjectOutput); values are deserialized inline.
     */
    protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException
    {
        c = byteArray.readChar();
        status = ARRAY_STATUS[byteArray.readInt()];
        if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2)
        {
            value = (V) byteArray.readObject();
        }
        int childSize = byteArray.readInt();
        child = new BaseNode[childSize];
        for (int i = 0; i < childSize; ++i)
        {
            child[i] = new Node<V>();
            child[i].walkToLoad(byteArray);
        }
    }

    public enum Status
    {
        /**
         * Unspecified; used for (logical) deletion of entries.
         */
        UNDEFINED_0,
        /**
         * Not the end of a word.
         */
        NOT_WORD_1,
        /**
         * Ends a word, and the path can continue further.
         */
        WORD_MIDDLE_2,
        /**
         * Ends a word with no continuation.
         */
        WORD_END_3,
    }

    /**
     * A (word, value) entry produced by walk(), ordered by word.
     */
    public class TrieEntry extends AbstractMap.SimpleEntry<String, V> implements Comparable<TrieEntry>
    {
        public TrieEntry(String key, V value)
        {
            super(key, value);
        }
        @Override
        public int compareTo(TrieEntry o)
        {
            return getKey().compareTo(o.getKey());
        }
    }

    @Override
    public String toString()
    {
        if (child == null)
        {
            return "BaseNode{" +
                    "status=" + status +
                    ", c=" + c +
                    ", value=" + value +
                    '}';
        }
        return "BaseNode{" +
                "child=" + child.length +
                ", status=" + status +
                ", c=" + c +
                ", value=" + value +
                '}';
    }
}

+ 668
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/BinTrie.java View File

@@ -0,0 +1,668 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/5/3 11:34</create-date>
*
* <copyright file="BinTrie.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie;

import com.hankcs.hanlp.collection.AhoCorasick.AhoCorasickDoubleArrayTrie;
import com.hankcs.hanlp.collection.trie.ITrie;
import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.IOUtil;
import com.hankcs.hanlp.utility.TextUtility;

import java.io.*;
import java.util.*;

import static com.hankcs.hanlp.utility.Predefine.logger;

/**
* 首字直接分配内存,之后二分动态数组的Trie树,能够平衡时间和空间
*
* @author hankcs
*/
public class BinTrie<V> extends BaseNode<V> implements ITrie<V>, Externalizable
{
    /**
     * Number of words currently stored in the trie.
     */
    private int size;

    /**
     * Creates an empty trie. The root pre-allocates one child slot per
     * possible char value (65536 slots) so the first character of a key is
     * resolved by direct array indexing.
     */
    public BinTrie()
    {
        child = new BaseNode[65535 + 1]; // (int)Character.MAX_VALUE
        size = 0;
        status = Status.NOT_WORD_1;
    }

public BinTrie(Map<String, V> map)
{
this();
for (Map.Entry<String, V> entry : map.entrySet())
{
put(entry.getKey(), entry.getValue());
}
}

/**
* 插入一个词
*
* @param key
* @param value
*/
public void put(String key, V value)
{
if (key.length() == 0) return; // 安全起见
BaseNode branch = this;
char[] chars = key.toCharArray();
for (int i = 0; i < chars.length - 1; ++i)
{
// 除了最后一个字外,都是继续
branch.addChild(new Node(chars[i], Status.NOT_WORD_1, null));
branch = branch.getChild(chars[i]);
}
// 最后一个字加入时属性为end
if (branch.addChild(new Node<V>(chars[chars.length - 1], Status.WORD_END_3, value)))
{
++size; // 维护size
}
}

public void put(char[] key, V value)
{
BaseNode branch = this;
for (int i = 0; i < key.length - 1; ++i)
{
// 除了最后一个字外,都是继续
branch.addChild(new Node(key[i], Status.NOT_WORD_1, null));
branch = branch.getChild(key[i]);
}
// 最后一个字加入时属性为end
if (branch.addChild(new Node<V>(key[key.length - 1], Status.WORD_END_3, value)))
{
++size; // 维护size
}
}

/**
* 设置键值对,当键不存在的时候会自动插入
* @param key
* @param value
*/
public void set(String key, V value)
{
put(key.toCharArray(), value);
}

/**
* 删除一个词
*
* @param key
*/
public void remove(String key)
{
BaseNode branch = this;
char[] chars = key.toCharArray();
for (int i = 0; i < chars.length - 1; ++i)
{
if (branch == null) return;
branch = branch.getChild(chars[i]);
}
if (branch == null) return;
// 最后一个字设为undefined
if (branch.addChild(new Node(chars[chars.length - 1], Status.UNDEFINED_0, value)))
{
--size;
}
}

public boolean containsKey(String key)
{
BaseNode branch = this;
char[] chars = key.toCharArray();
for (char aChar : chars)
{
if (branch == null) return false;
branch = branch.getChild(aChar);
}

return branch != null && (branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2);
}

public V get(String key)
{
BaseNode branch = this;
char[] chars = key.toCharArray();
for (char aChar : chars)
{
if (branch == null) return null;
branch = branch.getChild(aChar);
}

if (branch == null) return null;
// 下面这句可以保证只有成词的节点被返回
if (!(branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2)) return null;
return (V) branch.getValue();
}

public V get(char[] key)
{
BaseNode branch = this;
for (char aChar : key)
{
if (branch == null) return null;
branch = branch.getChild(aChar);
}

if (branch == null) return null;
// 下面这句可以保证只有成词的节点被返回
if (!(branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2)) return null;
return (V) branch.getValue();
}

@Override
public V[] getValueArray(V[] a)
{
if (a.length < size)
a = (V[]) java.lang.reflect.Array.newInstance(
a.getClass().getComponentType(), size);
int i = 0;
for (Map.Entry<String, V> entry : entrySet())
{
a[i++] = entry.getValue();
}
return a;
}

/**
* 获取键值对集合
*
* @return
*/
public Set<Map.Entry<String, V>> entrySet()
{
Set<Map.Entry<String, V>> entrySet = new TreeSet<Map.Entry<String, V>>();
StringBuilder sb = new StringBuilder();
for (BaseNode node : child)
{
if (node == null) continue;
node.walk(new StringBuilder(sb.toString()), entrySet);
}
return entrySet;
}

/**
* 键集合
* @return
*/
public Set<String> keySet()
{
TreeSet<String> keySet = new TreeSet<String>();
for (Map.Entry<String, V> entry : entrySet())
{
keySet.add(entry.getKey());
}

return keySet;
}

/**
* 前缀查询
*
* @param key 查询串
* @return 键值对
*/
public Set<Map.Entry<String, V>> prefixSearch(String key)
{
Set<Map.Entry<String, V>> entrySet = new TreeSet<Map.Entry<String, V>>();
StringBuilder sb = new StringBuilder(key.substring(0, key.length() - 1));
BaseNode branch = this;
char[] chars = key.toCharArray();
for (char aChar : chars)
{
if (branch == null) return entrySet;
branch = branch.getChild(aChar);
}

if (branch == null) return entrySet;
branch.walk(sb, entrySet);
return entrySet;
}

/**
* 前缀查询,包含值
*
* @param key 键
* @return 键值对列表
*/
public LinkedList<Map.Entry<String, V>> commonPrefixSearchWithValue(String key)
{
char[] chars = key.toCharArray();
return commonPrefixSearchWithValue(chars, 0);
}

/**
* 前缀查询,通过字符数组来表示字符串可以优化运行速度
*
* @param chars 字符串的字符数组
* @param begin 开始的下标
* @return
*/
public LinkedList<Map.Entry<String, V>> commonPrefixSearchWithValue(char[] chars, int begin)
{
LinkedList<Map.Entry<String, V>> result = new LinkedList<Map.Entry<String, V>>();
StringBuilder sb = new StringBuilder();
BaseNode branch = this;
for (int i = begin; i < chars.length; ++i)
{
char aChar = chars[i];
branch = branch.getChild(aChar);
if (branch == null || branch.status == Status.UNDEFINED_0) return result;
sb.append(aChar);
if (branch.status == Status.WORD_MIDDLE_2 || branch.status == Status.WORD_END_3)
{
result.add(new AbstractMap.SimpleEntry<String, V>(sb.toString(), (V) branch.value));
}
}

return result;
}

@Override
protected boolean addChild(BaseNode node)
{
boolean add = false;
char c = node.getChar();
BaseNode target = getChild(c);
if (target == null)
{
child[c] = node;
add = true;
}
else
{
switch (node.status)
{
case UNDEFINED_0:
if (target.status != Status.NOT_WORD_1)
{
target.status = Status.NOT_WORD_1;
add = true;
}
break;
case NOT_WORD_1:
if (target.status == Status.WORD_END_3)
{
target.status = Status.WORD_MIDDLE_2;
}
break;
case WORD_END_3:
if (target.status == Status.NOT_WORD_1)
{
target.status = Status.WORD_MIDDLE_2;
}
if (target.getValue() == null)
{
add = true;
}
target.setValue(node.getValue());
break;
}
}
return add;
}

public int size()
{
return size;
}

@Override
protected char getChar()
{
return 0; // 根节点没有char
}

@Override
public BaseNode getChild(char c)
{
return child[c];
}

public boolean save(String path)
{
try
{
DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(path));
for (BaseNode node : child)
{
if (node == null)
{
out.writeInt(0);
}
else
{
out.writeInt(1);
node.walkToSave(out);
}
}
out.close();
}
catch (Exception e)
{
logger.warning("保存到" + path + "失败" + TextUtility.exceptionToString(e));
return false;
}

return true;
}

@Override
public int build(TreeMap<String, V> keyValueMap)
{
for (Map.Entry<String, V> entry : keyValueMap.entrySet())
{
put(entry.getKey(), entry.getValue());
}
return 0;
}

/**
* 保存到二进制输出流
*
* @param out
* @return
*/
public boolean save(DataOutputStream out)
{
try
{
for (BaseNode node : child)
{
if (node == null)
{
out.writeInt(0);
}
else
{
out.writeInt(1);
node.walkToSave(out);
}
}
}
catch (Exception e)
{
logger.warning("保存到" + out + "失败" + TextUtility.exceptionToString(e));
return false;
}

return true;
}

/**
* 从磁盘加载二分数组树
*
* @param path 路径
* @param value 额外提供的值数组,按照值的字典序。(之所以要求提供它,是因为泛型的保存不归树管理)
* @return 是否成功
*/
public boolean load(String path, V[] value)
{
byte[] bytes = IOUtil.readBytes(path);
if (bytes == null) return false;
_ValueArray valueArray = new _ValueArray(value);
ByteArray byteArray = new ByteArray(bytes);
for (int i = 0; i < child.length; ++i)
{
int flag = byteArray.nextInt();
if (flag == 1)
{
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
size = value.length;

return true;
}

/**
* 只加载值,此时相当于一个set
*
* @param path
* @return
*/
public boolean load(String path)
{
byte[] bytes = IOUtil.readBytes(path);
if (bytes == null) return false;
_ValueArray valueArray = new _EmptyValueArray();
ByteArray byteArray = new ByteArray(bytes);
for (int i = 0; i < child.length; ++i)
{
int flag = byteArray.nextInt();
if (flag == 1)
{
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
size = -1; // 不知道有多少

return true;
}

public boolean load(ByteArray byteArray, _ValueArray valueArray)
{
for (int i = 0; i < child.length; ++i)
{
int flag = byteArray.nextInt();
if (flag == 1)
{
child[i] = new Node<V>();
child[i].walkToLoad(byteArray, valueArray);
}
}
size = valueArray.value.length;

return true;
}

public boolean load(ByteArray byteArray, V[] value)
{
return load(byteArray, newValueArray().setValue(value));
}

public _ValueArray newValueArray()
{
return new _ValueArray();
}

@Override
public void writeExternal(ObjectOutput out) throws IOException
{
out.writeInt(size);
for (BaseNode node : child)
{
if (node == null)
{
out.writeInt(0);
}
else
{
out.writeInt(1);
node.walkToSave(out);
}
}
}

@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException
{
size = in.readInt();
for (int i = 0; i < child.length; ++i)
{
int flag = in.readInt();
if (flag == 1)
{
child[i] = new Node<V>();
child[i].walkToLoad(in);
}
}
}

/**
* 最长匹配
*
* @param text 文本
* @param processor 处理器
*/
public void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit<V> processor)
{
int length = text.length();
for (int i = 0; i < length; ++i)
{
BaseNode<V> state = transition(text.charAt(i));
if (state != null)
{
int to = i + 1;
int end = to;
V value = state.getValue();
for (; to < length; ++to)
{
state = state.transition(text.charAt(to));
if (state == null) break;
if (state.getValue() != null)
{
value = state.getValue();
end = to + 1;
}
}
if (value != null)
{
processor.hit(i, end, value);
i = end - 1;
}
}
}
}

/**
* 最长匹配
*
* @param text 文本
* @param processor 处理器
*/
public void parseLongestText(char[] text, AhoCorasickDoubleArrayTrie.IHit<V> processor)
{
int length = text.length;
for (int i = 0; i < length; ++i)
{
BaseNode<V> state = transition(text[i]);
if (state != null)
{
int to = i + 1;
int end = to;
V value = state.getValue();
for (; to < length; ++to)
{
state = state.transition(text[to]);
if (state == null) break;
if (state.getValue() != null)
{
value = state.getValue();
end = to + 1;
}
}
if (value != null)
{
processor.hit(i, end, value);
i = end - 1;
}
}
}
}

/**
* 匹配文本
*
* @param text 文本
* @param processor 处理器
*/
public void parseText(String text, AhoCorasickDoubleArrayTrie.IHit<V> processor)
{
int length = text.length();
int begin = 0;
BaseNode<V> state = this;

for (int i = begin; i < length; ++i)
{
state = state.transition(text.charAt(i));
if (state != null)
{
V value = state.getValue();
if (value != null)
{
processor.hit(begin, i + 1, value);
}

/*如果是最后一位,这里不能直接跳出循环, 要继续从下一个字符开始判断*/
if (i == length - 1)
{
i = begin;
++begin;
state = this;
}
}
else
{
i = begin;
++begin;
state = this;
}
}
}

/**
* 匹配文本
*
* @param text 文本
* @param processor 处理器
*/
public void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit<V> processor)
{
int length = text.length;
int begin = 0;
BaseNode<V> state = this;

for (int i = begin; i < length; ++i)
{
state = state.transition(text[i]);
if (state != null)
{
V value = state.getValue();
if (value != null)
{
processor.hit(begin, i + 1, value);
}

/*如果是最后一位,这里不能直接跳出循环, 要继续从下一个字符开始判断*/
if (i == length - 1)
{
i = begin;
++begin;
state = this;
}
}
else
{
i = begin;
++begin;
state = this;
}
}
}
}

+ 103
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/Node.java View File

@@ -0,0 +1,103 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/5/3 12:27</create-date>
*
* <copyright file="Node.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie;


import com.hankcs.hanlp.collection.trie.bintrie.util.ArrayTool;

/**
 * A trie node at depth >= 2; children are kept in a sorted array and located by binary search.
 *
 * @author He Han
 */
public class Node<V> extends BaseNode
{
    /**
     * Insert or merge a child node.
     *
     * @return true iff the logical word count changed (word added, revived, or deleted)
     */
    @Override
    protected boolean addChild(BaseNode node)
    {
        boolean add = false;
        if (child == null)
        {
            child = new BaseNode[0];
        }
        int index = ArrayTool.binarySearch(child, node);
        if (index >= 0)
        {
            // a child with this char already exists: merge statuses
            BaseNode target = child[index];
            switch (node.status)
            {
                case UNDEFINED_0:
                    // deletion: demote to non-word and drop the value
                    if (target.status != Status.NOT_WORD_1)
                    {
                        target.status = Status.NOT_WORD_1;
                        target.value = null;
                        add = true;
                    }
                    break;
                case NOT_WORD_1:
                    // a longer word now passes through this word end
                    if (target.status == Status.WORD_END_3)
                    {
                        target.status = Status.WORD_MIDDLE_2;
                    }
                    break;
                case WORD_END_3:
                    // this node now terminates a word; keep MIDDLE if continuations exist
                    if (target.status != Status.WORD_END_3)
                    {
                        target.status = Status.WORD_MIDDLE_2;
                    }
                    if (target.getValue() == null)
                    {
                        add = true;
                    }
                    target.setValue(node.getValue());
                    break;
            }
        }
        else
        {
            // not found: insert at the position returned by binary search, keeping order
            BaseNode newChild[] = new BaseNode[child.length + 1];
            int insert = -(index + 1);
            System.arraycopy(child, 0, newChild, 0, insert);
            System.arraycopy(child, insert, newChild, insert + 1, child.length - insert);
            newChild[insert] = node;
            child = newChild;
            add = true;
        }
        return add;
    }

    /**
     * @param c      the node's character
     * @param status the node's status
     * @param value  the value
     */
    public Node(char c, Status status, V value)
    {
        this.c = c;
        this.status = status;
        this.value = value;
    }

    public Node()
    {
    }

    @Override
    public BaseNode getChild(char c)
    {
        if (child == null) return null;
        int index = ArrayTool.binarySearch(child, c);
        if (index < 0) return null;

        return child[index];
    }
}

+ 27
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/_EmptyValueArray.java View File

@@ -0,0 +1,27 @@
/*
* <summary></summary>
* <author>hankcs</author>
* <email>me@hankcs.com</email>
* <create-date>2015/5/15 10:24</create-date>
*
* <copyright file="EmptyValueArray.java">
* Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie;

/**
 * A value source that always yields {@code null}; used when a trie is loaded as a pure
 * key set with no values.
 *
 * @author hankcs
 */
public class _EmptyValueArray<V> extends _ValueArray<V>
{
    public _EmptyValueArray()
    {
        super();
    }

    @Override
    public V nextValue()
    {
        // no backing array: every slot is absent
        return null;
    }
}

+ 44
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/_ValueArray.java View File

@@ -0,0 +1,44 @@
/*
* <summary></summary>
* <author>hankcs</author>
* <email>me@hankcs.com</email>
* <create-date>2015/5/15 10:23</create-date>
*
* <copyright file="ValueArray.java">
* Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie;

/**
 * A thin wrapper over a value array that hands values out sequentially.
 *
 * @author hankcs
 */
public class _ValueArray<V>
{
    V[] value;
    int offset;

    public _ValueArray(V[] value)
    {
        this.value = value;
    }

    /**
     * Returns the value at the cursor and advances it.
     */
    public V nextValue()
    {
        V next = value[offset];
        offset += 1;
        return next;
    }

    /**
     * For subclass use only; leaves the wrapper without a backing array.
     */
    protected _ValueArray()
    {
    }

    /**
     * Replaces the backing array (cursor is not reset) and returns this for chaining.
     */
    public _ValueArray setValue(V[] value)
    {
        this.value = value;
        return this;
    }
}

+ 73
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/bintrie/util/ArrayTool.java View File

@@ -0,0 +1,73 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/5/3 12:32</create-date>
*
* <copyright file="ArrayTool.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.bintrie.util;


import com.hankcs.hanlp.collection.trie.bintrie.BaseNode;

/**
 * Binary-search helpers over sorted child-node arrays.
 *
 * @author He Han
 */
public class ArrayTool
{
    /**
     * Binary search for a node by comparison.
     *
     * @param branches sorted node array
     * @param node     the node to locate
     * @return the index if found; otherwise a negative value encoding the insertion
     *         point as -(insertionPoint + 1)
     */
    public static int binarySearch(BaseNode[] branches, BaseNode node)
    {
        int lo = 0;
        int hi = branches.length - 1;
        while (lo <= hi)
        {
            int mid = (lo + hi) >>> 1; // overflow-safe midpoint
            int cmp = branches[mid].compareTo(node);
            if (cmp == 0)
            {
                return mid;
            }
            else if (cmp < 0)
            {
                lo = mid + 1;
            }
            else
            {
                hi = mid - 1;
            }
        }
        return -(lo + 1);
    }

    /**
     * Binary search for a node by its character.
     *
     * @param branches sorted node array
     * @param node     the character to locate
     * @return the index if found; otherwise -(insertionPoint + 1)
     */
    public static int binarySearch(BaseNode[] branches, char node)
    {
        int lo = 0;
        int hi = branches.length - 1;
        while (lo <= hi)
        {
            int mid = (lo + hi) >>> 1; // overflow-safe midpoint
            int cmp = branches[mid].compareTo(node);
            if (cmp == 0)
            {
                return mid;
            }
            else if (cmp < 0)
            {
                lo = mid + 1;
            }
            else
            {
                hi = mid - 1;
            }
        }
        return -(lo + 1);
    }
}

+ 19
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/CharacterMapping.java View File

@@ -0,0 +1,19 @@
package com.hankcs.hanlp.collection.trie.datrie;

/**
 * Maps characters and strings to integer id sequences for use in a double-array trie.
 */
public interface CharacterMapping
{
    /** Initial size of the trie's id space. */
    int getInitSize();

    /** Number of distinct ids in the charset. */
    int getCharsetSize();

    /** The id reserved for the zero/terminator symbol. */
    int zeroId();

    /** Converts a string key to its id sequence. */
    int[] toIdList(String key);

    /** Converts a single Unicode code point to its id sequence. */
    int[] toIdList(int codePoint);

    /** Reconstructs a string from an id sequence (inverse of toIdList). */
    String toString(int[] ids);
}

+ 220
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/IntArrayList.java View File

@@ -0,0 +1,220 @@
package com.hankcs.hanlp.collection.trie.datrie;

import com.hankcs.hanlp.corpus.io.ByteArray;
import com.hankcs.hanlp.corpus.io.ICacheAble;

import java.io.*;
import java.util.ArrayList;

/**
 * A growable array of primitive ints with configurable linear or exponential expansion.
 */
public class IntArrayList implements Serializable, ICacheAble
{
    private static final long serialVersionUID = 1908530358259070518L;
    private int[] data;
    /**
     * actual number of elements
     */
    private int size;
    /**
     * linear growth increment
     */
    private int linearExpandFactor;

    public void setLinearExpandFactor(int linearExpandFactor)
    {
        this.linearExpandFactor = linearExpandFactor;
    }

    /**
     * whether to grow exponentially instead of linearly
     */
    private boolean exponentialExpanding = false;

    public boolean isExponentialExpanding()
    {
        return exponentialExpanding;
    }

    public void setExponentialExpanding(boolean multiplyExpanding)
    {
        this.exponentialExpanding = multiplyExpanding;
    }

    private double exponentialExpandFactor = 1.5;

    public double getExponentialExpandFactor()
    {
        return exponentialExpandFactor;
    }

    public void setExponentialExpandFactor(double exponentialExpandFactor)
    {
        this.exponentialExpandFactor = exponentialExpandFactor;
    }

    public IntArrayList()
    {
        this(1024);
    }

    public IntArrayList(int capacity)
    {
        this(capacity, 10240);
    }

    public IntArrayList(int capacity, int linearExpandFactor)
    {
        this.data = new int[capacity];
        this.size = 0;
        this.linearExpandFactor = linearExpandFactor;
    }

    /**
     * Grow the backing array by the configured policy.
     */
    private void expand()
    {
        int newCapacity;
        if (!exponentialExpanding)
        {
            newCapacity = this.data.length + this.linearExpandFactor;
        }
        else
        {
            // Bug fix: for length <= 1 (or a factor close to 1) the truncated product
            // length * factor can equal length, yielding zero growth and an infinite
            // append loop. Guarantee at least one extra slot.
            newCapacity = Math.max(this.data.length + 1,
                                   (int) (this.data.length * exponentialExpandFactor));
        }
        int[] newData = new int[newCapacity];
        System.arraycopy(this.data, 0, newData, 0, this.data.length);
        this.data = newData;
    }

    /**
     * Append an element at the end, growing the buffer when full.
     *
     * @param element
     */
    public void append(int element)
    {
        if (this.size == this.data.length)
        {
            expand();
        }
        this.data[this.size] = element;
        this.size += 1;
    }

    /**
     * Trim the backing buffer to the actual size.
     */
    public void loseWeight()
    {
        if (size == data.length)
        {
            return;
        }
        int[] newData = new int[size];
        System.arraycopy(this.data, 0, newData, 0, size);
        this.data = newData;
    }

    public int size()
    {
        return this.size;
    }

    public int getLinearExpandFactor()
    {
        return this.linearExpandFactor;
    }

    public void set(int index, int value)
    {
        this.data[index] = value;
    }

    public int get(int index)
    {
        return this.data[index];
    }

    public void removeLast()
    {
        --size;
    }

    public int getLast()
    {
        return data[size - 1];
    }

    public void setLast(int value)
    {
        data[size - 1] = value;
    }

    /**
     * Remove and return the last element.
     */
    public int pop()
    {
        return data[--size];
    }

    @Override
    public void save(DataOutputStream out) throws IOException
    {
        // layout: size, elements, linear factor, exponential flag, exponential factor
        out.writeInt(size);
        for (int i = 0; i < size; i++)
        {
            out.writeInt(data[i]);
        }
        out.writeInt(linearExpandFactor);
        out.writeBoolean(exponentialExpanding);
        out.writeDouble(exponentialExpandFactor);
    }

    @Override
    public boolean load(ByteArray byteArray)
    {
        if (byteArray == null)
        {
            return false;
        }
        size = byteArray.nextInt();
        data = new int[size];
        for (int i = 0; i < size; i++)
        {
            data[i] = byteArray.nextInt();
        }
        linearExpandFactor = byteArray.nextInt();
        exponentialExpanding = byteArray.nextBoolean();
        exponentialExpandFactor = byteArray.nextDouble();
        return true;
    }

    private void writeObject(ObjectOutputStream out) throws IOException
    {
        // trim before serializing so only live elements are written
        loseWeight();
        out.writeInt(size);
        out.writeObject(data);
        out.writeInt(linearExpandFactor);
        out.writeBoolean(exponentialExpanding);
        out.writeDouble(exponentialExpandFactor);
    }

    private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException
    {
        size = in.readInt();
        data = (int[]) in.readObject();
        linearExpandFactor = in.readInt();
        exponentialExpanding = in.readBoolean();
        exponentialExpandFactor = in.readDouble();
    }

    @Override
    public String toString()
    {
        // show at most the first 20 elements
        ArrayList<Integer> head = new ArrayList<Integer>(20);
        for (int i = 0; i < Math.min(size, 20); ++i)
        {
            head.add(data[i]);
        }
        return head.toString();
    }
}

+ 428
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/MutableDoubleArrayTrie.java View File

@@ -0,0 +1,428 @@
/*
* <author>Hankcs</author>
* <email>me@hankcs.com</email>
* <create-date>2017-11-17 下午1:48</create-date>
*
* <copyright file="MutableDoubleArrayTrie.java" company="码农场">
* Copyright (c) 2017, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
package com.hankcs.hanlp.collection.trie.datrie;

import java.util.*;

/**
 * A generic mutable double-array trie: keys are stored in a
 * {@link MutableDoubleArrayTrieInteger} whose int payload indexes into a value list.
 *
 * @author hankcs
 */
public class MutableDoubleArrayTrie<V> implements SortedMap<String, V>, Iterable<Map.Entry<String, V>>
{
    MutableDoubleArrayTrieInteger trie;
    ArrayList<V> values;

    public MutableDoubleArrayTrie()
    {
        trie = new MutableDoubleArrayTrieInteger();
        values = new ArrayList<V>();
    }

    public MutableDoubleArrayTrie(Map<String, V> map)
    {
        this();
        putAll(map);
    }

    /**
     * Trim internal buffers.
     */
    public void loseWeight()
    {
        trie.loseWeight();
    }

    @Override
    public String toString()
    {
        final StringBuilder sb = new StringBuilder("MutableDoubleArrayTrie{");
        sb.append("size=").append(size()).append(',');
        sb.append("allocated=").append(trie.getBaseArraySize()).append(',');
        sb.append('}');
        return sb.toString();
    }

    @Override
    public Comparator<? super String> comparator()
    {
        // natural String ordering
        return new Comparator<String>()
        {
            @Override
            public int compare(String o1, String o2)
            {
                return o1.compareTo(o2);
            }
        };
    }

    @Override
    public SortedMap<String, V> subMap(String fromKey, String toKey)
    {
        throw new UnsupportedOperationException();
    }

    @Override
    public SortedMap<String, V> headMap(String toKey)
    {
        throw new UnsupportedOperationException();
    }

    @Override
    public SortedMap<String, V> tailMap(String fromKey)
    {
        throw new UnsupportedOperationException();
    }

    @Override
    public String firstKey()
    {
        return trie.iterator().key();
    }

    @Override
    public String lastKey()
    {
        // advance a cursor to the end; the cursor then rests on the last key
        MutableDoubleArrayTrieInteger.KeyValuePair iterator = trie.iterator();
        while (iterator.hasNext())
        {
            iterator.next();
        }
        return iterator.key();
    }

    @Override
    public int size()
    {
        return trie.size();
    }

    @Override
    public boolean isEmpty()
    {
        return trie.isEmpty();
    }

    @Override
    public boolean containsKey(Object key)
    {
        if (key == null || !(key instanceof String))
            return false;
        return trie.containsKey((String) key);
    }

    @Override
    public boolean containsValue(Object value)
    {
        return values.contains(value);
    }

    @Override
    public V get(Object key)
    {
        if (key == null)
            return null;
        int id;
        if (key instanceof String)
        {
            id = trie.get((String) key);
        }
        else
        {
            id = trie.get(key.toString());
        }
        if (id == -1)
            return null;
        return values.get(id);
    }

    @Override
    public V put(String key, V value)
    {
        int id = trie.get(key);
        if (id == -1)
        {
            // new key: its id is the next slot in the value list
            trie.set(key, values.size());
            values.add(value);
            return null;
        }
        else
        {
            V v = values.get(id);
            values.set(id, value);
            return v;
        }
    }

    @Override
    public V remove(Object key)
    {
        if (key == null) return null;
        int id = trie.remove(key instanceof String ? (String) key : key.toString());
        if (id == -1)
            return null;
        // compact: ids above the removed one shift down, mirroring values.remove(id)
        trie.decreaseValues(id);
        return values.remove(id);
    }

    @Override
    public void putAll(Map<? extends String, ? extends V> m)
    {
        for (Entry<? extends String, ? extends V> entry : m.entrySet())
        {
            put(entry.getKey(), entry.getValue());
        }
    }

    @Override
    public void clear()
    {
        trie.clear();
        values.clear();
    }

    @Override
    public Set<String> keySet()
    {
        // unmodifiable-ish live view; only size/isEmpty/iterator/remove/containsAll/removeAll/clear supported
        return new Set<String>()
        {
            @Override
            public int size()
            {
                return trie.size();
            }

            @Override
            public boolean isEmpty()
            {
                return trie.isEmpty();
            }

            @Override
            public boolean contains(Object o)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public Iterator<String> iterator()
            {
                return new Iterator<String>()
                {
                    // Bug fix: create a fresh cursor per iterator (matching entrySet());
                    // a single cursor shared across iterator() calls made every
                    // iteration after the first start exhausted.
                    MutableDoubleArrayTrieInteger.KeyValuePair iterator = trie.iterator();

                    @Override
                    public boolean hasNext()
                    {
                        return iterator.hasNext();
                    }

                    @Override
                    public String next()
                    {
                        return iterator.next().key();
                    }

                    @Override
                    public void remove()
                    {
                        throw new UnsupportedOperationException();
                    }
                };
            }

            @Override
            public Object[] toArray()
            {
                return values.toArray();
            }

            @Override
            public <T> T[] toArray(T[] a)
            {
                return values.toArray(a);
            }

            @Override
            public boolean add(String s)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean remove(Object o)
            {
                return trie.remove((String) o) != -1;
            }

            @Override
            public boolean containsAll(Collection<?> c)
            {
                for (Object o : c)
                {
                    if (!trie.containsKey((String) o))
                        return false;
                }
                return true;
            }

            @Override
            public boolean addAll(Collection<? extends String> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean retainAll(Collection<?> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean removeAll(Collection<?> c)
            {
                // Bug fix: remove every element of c. The previous version guarded with
                // "if (!changed)", which stopped removing after the first success.
                boolean changed = false;
                for (Object o : c)
                {
                    if (MutableDoubleArrayTrie.this.remove(o) != null)
                    {
                        changed = true;
                    }
                }
                return changed;
            }

            @Override
            public void clear()
            {
                MutableDoubleArrayTrie.this.clear();
            }
        };
    }

    @Override
    public Collection<V> values()
    {
        return values;
    }

    @Override
    public Set<Entry<String, V>> entrySet()
    {
        // live view; only size/isEmpty/iterator/clear supported
        return new Set<Entry<String, V>>()
        {
            @Override
            public int size()
            {
                return trie.size();
            }

            @Override
            public boolean isEmpty()
            {
                return trie.isEmpty();
            }

            @Override
            public boolean contains(Object o)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public Iterator<Entry<String, V>> iterator()
            {
                return new Iterator<Entry<String, V>>()
                {
                    MutableDoubleArrayTrieInteger.KeyValuePair iterator = trie.iterator();

                    @Override
                    public boolean hasNext()
                    {
                        return iterator.hasNext();
                    }

                    @Override
                    public Entry<String, V> next()
                    {
                        iterator.next();
                        return new AbstractMap.SimpleEntry<String, V>(iterator.key(), values.get(iterator.value()));
                    }

                    @Override
                    public void remove()
                    {
                        throw new UnsupportedOperationException();
                    }
                };
            }

            @Override
            public Object[] toArray()
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public <T> T[] toArray(T[] a)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean add(Entry<String, V> stringVEntry)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean remove(Object o)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean containsAll(Collection<?> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean addAll(Collection<? extends Entry<String, V>> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean retainAll(Collection<?> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public boolean removeAll(Collection<?> c)
            {
                throw new UnsupportedOperationException();
            }

            @Override
            public void clear()
            {
                MutableDoubleArrayTrie.this.clear();
            }
        };
    }

    @Override
    public Iterator<Entry<String, V>> iterator()
    {
        return entrySet().iterator();
    }
}

+ 1385
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/MutableDoubleArrayTrieInteger.java
File diff suppressed because it is too large
View File


+ 119
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/Utf8CharacterMapping.java View File

@@ -0,0 +1,119 @@
package com.hankcs.hanlp.collection.trie.datrie;

import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;

/**
 * Maps strings/code points to int id sequences via their UTF-8 byte encoding
 * (one id per byte, 0..255).
 */
public class Utf8CharacterMapping implements CharacterMapping, Serializable
{
    private static final long serialVersionUID = -6529481088518753872L;
    // 256 ids: one per possible unsigned byte value
    private static final int N = 256;
    private static final int[] EMPTYLIST = new int[0];
    public static final Charset UTF_8 = Charset.forName("UTF-8");

    @Override
    public int getInitSize()
    {
        return N;
    }

    @Override
    public int getCharsetSize()
    {
        return N;
    }

    @Override
    public int zeroId()
    {
        return 0;
    }

    @Override
    public int[] toIdList(String key)
    {

        byte[] bytes = key.getBytes(UTF_8);
        int[] res = new int[bytes.length];
        for (int i = 0; i < res.length; i++)
        {
            res[i] = bytes[i] & 0xFF; // unsigned byte
        }
        // a lone NUL byte is treated as the empty sequence
        if ((res.length == 1) && (res[0] == 0))
        {
            return EMPTYLIST;
        }
        return res;
    }

    /**
     * codes ported from iconv lib in utf8.h utf8_codepointtomb
     */
    @Override
    public int[] toIdList(int codePoint)
    {
        // determine the UTF-8 byte count for this code point
        int count;
        if (codePoint < 0x80)
            count = 1;
        else if (codePoint < 0x800)
            count = 2;
        else if (codePoint < 0x10000)
            count = 3;
        else if (codePoint < 0x200000)
            count = 4;
        else if (codePoint < 0x4000000)
            count = 5;
        else if (codePoint <= 0x7fffffff)
            count = 6;
        else
            return EMPTYLIST;
        int[] r = new int[count];
        // emit continuation bytes from last to first, 6 bits at a time
        switch (count)
        { /* note: code falls through cases! */
            case 6:
                r[5] = (char) (0x80 | (codePoint & 0x3f));
                codePoint = codePoint >> 6;
                codePoint |= 0x4000000;
            case 5:
                r[4] = (char) (0x80 | (codePoint & 0x3f));
                codePoint = codePoint >> 6;
                codePoint |= 0x200000;
            case 4:
                r[3] = (char) (0x80 | (codePoint & 0x3f));
                codePoint = codePoint >> 6;
                codePoint |= 0x10000;
            case 3:
                r[2] = (char) (0x80 | (codePoint & 0x3f));
                codePoint = codePoint >> 6;
                codePoint |= 0x800;
            case 2:
                r[1] = (char) (0x80 | (codePoint & 0x3f));
                codePoint = codePoint >> 6;
                codePoint |= 0xc0;
            case 1:
                r[0] = (char) codePoint;
        }
        return r;
    }

    @Override
    public String toString(int[] ids)
    {
        // reinterpret each id as one UTF-8 byte and decode
        byte[] bytes = new byte[ids.length];
        for (int i = 0; i < ids.length; i++)
        {
            bytes[i] = (byte) ids[i];
        }
        try
        {
            return new String(bytes, "UTF-8");
        }
        catch (UnsupportedEncodingException e)
        {
            // UTF-8 is guaranteed by the JVM spec; unreachable in practice
            return null;
        }
    }
}

+ 14
- 0
core/src/main/java/com/hankcs/hanlp/collection/trie/datrie/package-info.java View File

@@ -0,0 +1,14 @@
/*
* <author>Hankcs</author>
* <email>me@hankcs.com</email>
* <create-date>2018-02-28 下午9:17</create-date>
*
* <copyright file="package-info.java" company="码农场">
* Copyright (c) 2018, 码农场. All Right Reserved, http://www.hankcs.com/
* This source is subject to Hankcs. Please contact Hankcs to get more information.
* </copyright>
*/
/**
* 可变双数组trie树,可以当做Map<String, V>来用。如果V是int,可以直接用MutableDoubleArrayTrieInteger
*/
package com.hankcs.hanlp.collection.trie.datrie;

+ 43
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLFixer.java View File

@@ -0,0 +1,43 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/19 18:55</create-date>
*
* <copyright file="CoNLLFixer.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

import com.hankcs.hanlp.corpus.io.IOUtil;

/**
 * Repairs dependency corpora whose lines have fewer than the 10 CoNLL columns by
 * padding missing columns with "_". Writes the result next to the input as
 * "&lt;path&gt;.fixed.txt".
 *
 * @author hankcs
 */
public class CoNLLFixer
{
    public static boolean fix(String path)
    {
        StringBuilder output = new StringBuilder();
        for (String line : IOUtil.readLineListWithLessMemory(path))
        {
            // blank separator lines pass through untouched
            if (line.trim().length() == 0)
            {
                output.append(line).append('\n');
                continue;
            }
            String[] cells = line.split("\t");
            StringBuilder padded = new StringBuilder(line);
            for (int missing = 10 - cells.length; missing > 0; --missing)
            {
                padded.append("\t_");
            }
            output.append(padded).append('\n');
        }
        return IOUtil.saveTxt(path + ".fixed.txt", output.toString());
    }
}

+ 41
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLLoader.java View File

@@ -0,0 +1,41 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/19 18:53</create-date>
*
* <copyright file="CoNLLLoader.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

import com.hankcs.hanlp.corpus.io.IOUtil;

import java.util.LinkedList;

/**
 * Loads dependency corpora in CoNLL format (sentences separated by blank lines).
 *
 * @author hankcs
 */
public class CoNLLLoader
{
    /**
     * Read all sentences from a CoNLL file.
     *
     * @param path file path
     * @return one CoNLLSentence per blank-line-delimited block
     */
    public static LinkedList<CoNLLSentence> loadSentenceList(String path)
    {
        LinkedList<CoNLLSentence> result = new LinkedList<CoNLLSentence>();
        LinkedList<CoNllLine> lineList = new LinkedList<CoNllLine>();
        for (String line : IOUtil.readLineListWithLessMemory(path))
        {
            if (line.trim().length() == 0)
            {
                // bug fix: skip consecutive blank lines instead of emitting empty sentences
                if (!lineList.isEmpty())
                {
                    result.add(new CoNLLSentence(lineList));
                    lineList = new LinkedList<CoNllLine>();
                }
                continue;
            }
            lineList.add(new CoNllLine(line.split("\t")));
        }
        // bug fix: flush the last sentence when the file does not end with a blank line;
        // previously it was silently dropped
        if (!lineList.isEmpty())
        {
            result.add(new CoNLLSentence(lineList));
        }

        return result;
    }
}

+ 164
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLSentence.java View File

@@ -0,0 +1,164 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/20 11:05</create-date>
*
* <copyright file="CoNLLSentence.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

/**
 * One sentence of a CoNLL corpus: an array of words, one per corpus line.
 *
 * @author hankcs
 */
public class CoNLLSentence implements Iterable<CoNLLWord>
{
    /**
     * The words of the sentence, one per CoNLL line.
     */
    public CoNLLWord[] word;

    /**
     * Builds a sentence from parsed CoNLL lines and resolves head pointers.
     *
     * @param lineList the lines of one sentence, in order
     */
    public CoNLLSentence(List<CoNllLine> lineList)
    {
        CoNllLine[] lines = lineList.toArray(new CoNllLine[0]);
        this.word = new CoNLLWord[lines.length];
        for (int i = 0; i < lines.length; ++i)
        {
            word[i] = new CoNLLWord(lines[i]);
        }
        // Second pass: column 7 holds the 1-based head id (0 means the root),
        // which can only be resolved once every word object exists.
        for (CoNLLWord w : word)
        {
            int head = Integer.parseInt(lines[w.ID - 1].value[6]) - 1;
            w.HEAD = head == -1 ? CoNLLWord.ROOT : word[head];
        }
    }

    public CoNLLSentence(CoNLLWord[] word)
    {
        this.word = word;
    }

    @Override
    public String toString()
    {
        StringBuilder text = new StringBuilder(word.length * 50);
        for (int i = 0; i < word.length; ++i)
        {
            text.append(word[i]).append('\n');
        }
        return text.toString();
    }

    /**
     * Edge matrix of the sentence: edge[i][j] is the dependency label of the
     * arc from word i to head j, or null when no such arc exists.
     *
     * @return an (n+1) x (n+1) label matrix, index 0 being the root
     */
    public String[][] getEdgeArray()
    {
        int size = word.length + 1;
        String[][] edge = new String[size][size];
        for (CoNLLWord w : word)
        {
            edge[w.ID][w.HEAD.ID] = w.DEPREL;
        }
        return edge;
    }

    /**
     * Word array prefixed with the artificial root node.
     *
     * @return array of length word.length + 1 with the root at index 0
     */
    public CoNLLWord[] getWordArrayWithRoot()
    {
        CoNLLWord[] withRoot = new CoNLLWord[word.length + 1];
        withRoot[0] = CoNLLWord.ROOT;
        System.arraycopy(word, 0, withRoot, 1, word.length);
        return withRoot;
    }

    public CoNLLWord[] getWordArray()
    {
        return word;
    }

    @Override
    public Iterator<CoNLLWord> iterator()
    {
        // Read-only cursor over the word array; remove() is forbidden.
        return new Iterator<CoNLLWord>()
        {
            private int cursor;

            @Override
            public boolean hasNext()
            {
                return cursor < word.length;
            }

            @Override
            public CoNLLWord next()
            {
                return word[cursor++];
            }

            @Override
            public void remove()
            {
                throw new UnsupportedOperationException("CoNLLSentence是只读对象,不允许删除");
            }
        };
    }

    /**
     * Collects every word whose head is the given word.
     *
     * @param word the head to look for
     * @return its direct dependents (possibly empty)
     */
    public List<CoNLLWord> findChildren(CoNLLWord word)
    {
        List<CoNLLWord> children = new LinkedList<CoNLLWord>();
        for (CoNLLWord candidate : this.word)
        {
            if (candidate.HEAD == word)
            {
                children.add(candidate);
            }
        }
        return children;
    }

    /**
     * Collects every word whose head is the given word and whose dependency
     * relation equals the given label.
     *
     * @param word     the head to look for
     * @param relation the dependency label to match
     * @return matching dependents (possibly empty)
     */
    public List<CoNLLWord> findChildren(CoNLLWord word, String relation)
    {
        List<CoNLLWord> children = new LinkedList<CoNLLWord>();
        for (CoNLLWord candidate : this.word)
        {
            if (candidate.HEAD == word && candidate.DEPREL.equals(relation))
            {
                children.add(candidate);
            }
        }
        return children;
    }
}

+ 125
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNLLWord.java View File

@@ -0,0 +1,125 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/20 12:38</create-date>
*
* <copyright file="CoNLLWord.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

/**
 * A single word (one row) of a CoNLL dependency corpus.
 *
 * @author hankcs
 */
public class CoNLLWord
{
    /** ID: position of the word within the sentence, starting at 1. */
    public int ID;
    /** Lemma or stem of the token; for Chinese this equals the surface form. */
    public String LEMMA;
    /** Coarse-grained part-of-speech tag. */
    public String CPOSTAG;
    /** Fine-grained part-of-speech tag. */
    public String POSTAG;
    /** Head word this word depends on. */
    public CoNLLWord HEAD;
    /** Dependency relation between this word and its head. */
    public String DEPREL;

    /** Equivalence-class string compiled from POSTAG/LEMMA. */
    public String NAME;

    /** The artificial root node. */
    public static final CoNLLWord ROOT = new CoNLLWord(0, "##核心##", "ROOT", "root");
    /** Blank placeholder describing positions outside the word array. */
    public static final CoNLLWord NULL = new CoNLLWord(-1, "##空白##", "NULL", "null");

    /**
     * @param ID     1-based position of the word in the sentence
     * @param LEMMA  lemma/stem (same as the surface form in Chinese)
     * @param POSTAG fine-grained POS tag; its first character becomes the coarse tag
     */
    public CoNLLWord(int ID, String LEMMA, String POSTAG)
    {
        // Delegate: the coarse tag is simply the first letter of the fine tag.
        this(ID, LEMMA, POSTAG.substring(0, 1), POSTAG);
    }

    /**
     * @param ID      1-based position of the word in the sentence
     * @param LEMMA   lemma/stem (same as the surface form in Chinese)
     * @param CPOSTAG coarse-grained POS tag
     * @param POSTAG  fine-grained POS tag
     */
    public CoNLLWord(int ID, String LEMMA, String CPOSTAG, String POSTAG)
    {
        this.ID = ID;
        this.LEMMA = LEMMA;
        this.CPOSTAG = CPOSTAG;
        this.POSTAG = POSTAG;
        compile();
    }

    /** Recomputes NAME, the equivalence-class form of this word. */
    private void compile()
    {
        this.NAME = PosTagCompiler.compile(POSTAG, LEMMA);
    }

    /** Builds a word from one parsed CoNLL line; HEAD is resolved later by the sentence. */
    public CoNLLWord(CoNllLine line)
    {
        ID = line.id;
        LEMMA = line.value[2];
        CPOSTAG = line.value[3];
        POSTAG = line.value[4];
        DEPREL = line.value[7];
        compile();
    }

    public CoNLLWord(CoNllLine[] lineArray, int index)
    {
        this(lineArray[index]);
    }

    @Override
    public String toString()
    {
        // ID 0 is the root node and -1 the blank node: neither has a printable head id.
        String head = (ID != 0 && ID != -1) ? String.valueOf(HEAD.ID) : "_";
        StringBuilder sb = new StringBuilder();
        sb.append(ID).append('\t').append(LEMMA).append('\t').append(LEMMA).append('\t').append(CPOSTAG).append('\t')
                .append(POSTAG).append('\t').append('_').append('\t').append(head).append('\t').append(DEPREL).append('\t')
                .append('_').append('\t').append('_');
        return sb.toString();
    }
}

+ 51
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/CoNllLine.java View File

@@ -0,0 +1,51 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/20 11:06</create-date>
*
* <copyright file="CoNllLine.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

/**
 * One row of a CoNLL corpus: up to ten tab-separated columns.
 *
 * @author hankcs
 */
public class CoNllLine
{
    /**
     * The ten column values; columns missing from the input stay null.
     */
    public String[] value = new String[10];

    /**
     * The first column parsed as an integer id.
     */
    public int id;

    public CoNllLine(String... args)
    {
        // Copy at most ten columns; extra columns are ignored.
        int count = args.length < value.length ? args.length : value.length;
        System.arraycopy(args, 0, value, 0, count);
        id = Integer.parseInt(value[0]);
    }

    @Override
    public String toString()
    {
        // Join all ten columns with tabs (null columns print as "null").
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < value.length; ++i)
        {
            if (i > 0) sb.append('\t');
            sb.append(value[i]);
        }
        return sb.toString();
    }
}

+ 90
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/Evaluator.java View File

@@ -0,0 +1,90 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/26 16:21</create-date>
*
* <copyright file="Evaluater.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

import java.text.NumberFormat;

/**
 * Evaluation helper for dependency parsing accuracy.
 *
 * @author hankcs
 */
public class Evaluator
{
    // U: unlabeled-correct count, L: labeled-correct count,
    // D: labeled-correct excluding root attachments,
    // A: total number of gold words.
    float U, L, D, A;
    int sentenceCount;
    long start;

    public Evaluator()
    {
        start = System.currentTimeMillis();
    }

    /**
     * Scores one system-parsed sentence against its gold sentence.
     * NOTE(review): iterates over {@code test} but indexes {@code right} with
     * the same i — assumes both have identical length; confirm at call sites.
     *
     * @param right gold-standard sentence
     * @param test  system output sentence
     */
    public void e(CoNLLSentence right, CoNLLSentence test)
    {
        ++sentenceCount;
        A += right.word.length;
        for (int i = 0; i < test.word.length; ++i)
        {
            CoNLLWord gold = right.word[i];
            CoNLLWord sys = test.word[i];
            if (sys.HEAD.ID != gold.HEAD.ID) continue;     // wrong head: nothing scores
            ++U;
            if (!gold.DEPREL.equals(sys.DEPREL)) continue; // wrong label: only U scores
            ++L;
            if (sys.HEAD.ID != 0) ++D;                     // D excludes root attachments
        }
    }

    /** Unlabeled attachment accuracy: correct heads over all gold words. */
    public float getUA()
    {
        return U / A;
    }

    /** Labeled attachment accuracy: correct head+label over all gold words. */
    public float getLA()
    {
        return L / A;
    }

    /** Labeled accuracy over non-root attachments (assumes one root word per sentence). */
    public float getDA()
    {
        return D / (A - sentenceCount);
    }

    @Override
    public String toString()
    {
        NumberFormat percent = NumberFormat.getPercentInstance();
        percent.setMinimumFractionDigits(2);
        StringBuilder report = new StringBuilder();
        report.append("UA: ").append(percent.format(getUA())).append('\t');
        report.append("LA: ").append(percent.format(getLA())).append('\t');
        report.append("DA: ").append(percent.format(getDA())).append('\t');
        report.append("sentences: ").append(sentenceCount).append('\t');
        report.append("speed: ").append(sentenceCount / (float) (System.currentTimeMillis() - start) * 1000);
        report.append(" sent/s");
        return report.toString();
    }
}

+ 63
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/CoNll/PosTagCompiler.java View File

@@ -0,0 +1,63 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/20 13:54</create-date>
*
* <copyright file="PostTagCompiler.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.CoNll;

import com.hankcs.hanlp.utility.Predefine;

/**
 * Equivalence-word compiler: collapses open-class words into placeholder
 * tokens so that sparse surface forms share statistics.
 *
 * @author hankcs
 */
public class PosTagCompiler
{
    /**
     * Compiles a word to its equivalence class, e.g. a numeral becomes ##数##.
     * Rules are checked in order; prefix rules like "m" also cover compound
     * tags such as "mq". Words whose tag matches no rule are returned as-is.
     *
     * @param tag  POS tag of the word (e.g. "m", "nr", "ns")
     * @param name the surface form
     * @return the placeholder constant for a matched tag, otherwise {@code name}
     */
    public static String compile(String tag, String name)
    {
        if (tag.startsWith("m")) return Predefine.TAG_NUMBER;
        else if (tag.startsWith("nr")) return Predefine.TAG_PEOPLE;
        else if (tag.startsWith("ns")) return Predefine.TAG_PLACE;
        else if (tag.startsWith("nt")) return Predefine.TAG_GROUP;
        else if (tag.startsWith("t")) return Predefine.TAG_TIME;
        else if (tag.equals("x")) return Predefine.TAG_CLUSTER;
        else if (tag.equals("nx")) return Predefine.TAG_PROPER;
        else if (tag.equals("xx")) return Predefine.TAG_OTHER;

        // (A stale commented-out switch version was removed here; it disagreed
        // with the live code by mapping "nt" to TAG_TIME instead of TAG_GROUP.)
        return name;
    }
}

+ 97
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/model/MaxEntDependencyModelMaker.java View File

@@ -0,0 +1,97 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/25 20:53</create-date>
*
* <copyright file="MaxEntDependencyModelMaker.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.model;

import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLLoader;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;
import com.hankcs.hanlp.corpus.dictionary.DictionaryMaker;
import com.hankcs.hanlp.corpus.io.IOUtil;

import java.io.*;
import java.util.*;

/**
 * Maximum-entropy model building tool. Training itself is delegated to
 * opennlp for now; this maker only generates the training file.
 *
 * @author hankcs
 */
public class MaxEntDependencyModelMaker
{
    /**
     * Writes one training event per ordered word pair (i, j) of every
     * sentence: the pair's features followed by the event name, which is the
     * dependency label of the arc i→j (the string "null" when no arc exists).
     *
     * @param corpusLoadPath CoNLL corpus to read
     * @param modelSavePath  path of the training file to write
     * @return true on success
     * @throws IOException if the output cannot be written
     */
    public static boolean makeModel(String corpusLoadPath, String modelSavePath) throws IOException
    {
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(IOUtil.newOutputStream(modelSavePath)));
        try
        {
            LinkedList<CoNLLSentence> sentenceList = CoNLLLoader.loadSentenceList(corpusLoadPath);
            int id = 1;
            for (CoNLLSentence sentence : sentenceList)
            {
                System.out.printf("%d / %d...", id++, sentenceList.size());
                String[][] edgeArray = sentence.getEdgeArray();
                CoNLLWord[] word = sentence.getWordArrayWithRoot();
                for (int i = 0; i < word.length; ++i)
                {
                    for (int j = 0; j < word.length; ++j)
                    {
                        if (i == j) continue;
                        // One edge instance from i to j; a non-existent edge
                        // yields the label null and is still a valid instance.
                        List<String> contextList = new LinkedList<String>();
                        // Atomic features of i and of j.
                        contextList.addAll(generateSingleWordContext(word, i, "i"));
                        contextList.addAll(generateSingleWordContext(word, j, "j"));
                        // Pairwise features of (i, j).
                        contextList.addAll(generateUniContext(word, i, j));
                        // Serialize the features.
                        for (String f : contextList)
                        {
                            bw.write(f);
                            bw.write(' ');
                        }
                        // The event name is the dependency relation.
                        bw.write("" + edgeArray[i][j]);
                        bw.newLine();
                    }
                }
                System.out.println("done.");
            }
        }
        finally
        {
            // Fix: close even when an exception unwinds (the writer used to leak).
            bw.close();
        }
        return true;
    }

    /**
     * Atomic features of the word at {@code index}: surface form and POS tag
     * of every word in a ±2 window, each suffixed with the mark and the
     * relative offset so features of i and j cannot collide.
     *
     * @param word  word array including the root
     * @param index center of the window
     * @param mark  "i" or "j" disambiguation suffix
     * @return the feature strings
     */
    public static Collection<String> generateSingleWordContext(CoNLLWord[] word, int index, String mark)
    {
        Collection<String> context = new LinkedList<String>();
        for (int i = index - 2; i < index + 2 + 1; ++i)
        {
            // Out-of-range positions are represented by the blank placeholder.
            CoNLLWord w = i >= 0 && i < word.length ? word[i] : CoNLLWord.NULL;
            context.add(w.NAME + mark + (i - index)); // suffix avoids feature collisions
            context.add(w.POSTAG + mark + (i - index));
        }

        return context;
    }

    /**
     * Pairwise features of the ordered pair (i, j): name/tag bigrams, the
     * same with the signed distance, and bigrams extended with each word's
     * left neighbor.
     *
     * @param word word array including the root
     * @param i    source index
     * @param j    target index
     * @return the feature strings
     */
    public static Collection<String> generateUniContext(CoNLLWord[] word, int i, int j)
    {
        Collection<String> context = new LinkedList<String>();
        context.add(word[i].NAME + '→' + word[j].NAME);
        context.add(word[i].POSTAG + '→' + word[j].POSTAG);
        context.add(word[i].NAME + '→' + word[j].NAME + (i - j));
        context.add(word[i].POSTAG + '→' + word[j].POSTAG + (i - j));
        CoNLLWord wordBeforeI = i - 1 >= 0 ? word[i - 1] : CoNLLWord.NULL;
        CoNLLWord wordBeforeJ = j - 1 >= 0 ? word[j - 1] : CoNLLWord.NULL;
        context.add(wordBeforeI.NAME + '@' + word[i].NAME + '→' + word[j].NAME);
        context.add(word[i].NAME + '→' + wordBeforeJ.NAME + '@' + word[j].NAME);
        context.add(wordBeforeI.POSTAG + '@' + word[i].POSTAG + '→' + word[j].POSTAG);
        context.add(word[i].POSTAG + '→' + wordBeforeJ.POSTAG + '@' + word[j].POSTAG);
        return context;
    }
}

+ 81
- 0
core/src/main/java/com/hankcs/hanlp/corpus/dependency/model/WordNatureWeightModelMaker.java View File

@@ -0,0 +1,81 @@
/*
* <summary></summary>
* <author>He Han</author>
* <email>hankcs.cn@gmail.com</email>
* <create-date>2014/11/20 12:27</create-date>
*
* <copyright file="WordNatureWeightScorer.java" company="上海林原信息科技有限公司">
* Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/
* This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information.
* </copyright>
*/
package com.hankcs.hanlp.corpus.dependency.model;

import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLLoader;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLSentence;
import com.hankcs.hanlp.corpus.dependency.CoNll.CoNLLWord;
import com.hankcs.hanlp.corpus.dictionary.DictionaryMaker;
import com.hankcs.hanlp.corpus.document.sentence.word.Word;
import com.hankcs.hanlp.corpus.io.IOUtil;

import java.util.Set;
import java.util.TreeSet;

/**
 * Builder for the word/POS-pair weight model used by the generative scorer.
 *
 * @author hankcs
 */
public class WordNatureWeightModelMaker
{
    /**
     * Builds the model from a CoNLL corpus and saves it as text. For every
     * word it counts "dependent@head" pairs in four combinations (word-word,
     * word-&lt;tag&gt;, &lt;tag&gt;-word, &lt;tag&gt;-&lt;tag&gt;), labeled
     * with the dependency relation.
     *
     * @param corpusLoadPath CoNLL corpus path
     * @param modelSavePath  output model path
     * @return true when the dictionary was saved successfully
     */
    public static boolean makeModel(String corpusLoadPath, String modelSavePath)
    {
        Set<String> posSet = new TreeSet<String>();
        DictionaryMaker dictionaryMaker = new DictionaryMaker();
        // First pass over the corpus: count all four pair combinations.
        for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath))
        {
            for (CoNLLWord word : sentence.word)
            {
                addPair(word.NAME, word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(word.NAME, wrapTag(word.HEAD.POSTAG ), word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
                posSet.add(word.POSTAG);
            }
        }
        // NOTE(review): this second pass is byte-identical to the first, so
        // every count ends up doubled. Relative frequencies are unchanged but
        // it doubles runtime; looks like a copy-paste duplication — confirm
        // intent before removing.
        for (CoNLLSentence sentence : CoNLLLoader.loadSentenceList(corpusLoadPath))
        {
            for (CoNLLWord word : sentence.word)
            {
                addPair(word.NAME, word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(word.NAME, wrapTag(word.HEAD.POSTAG ), word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), word.HEAD.NAME, word.DEPREL, dictionaryMaker);
                addPair(wrapTag(word.POSTAG), wrapTag(word.HEAD.POSTAG), word.DEPREL, dictionaryMaker);
                posSet.add(word.POSTAG);
            }
        }
        // Side output: generate "case" lines for every tag seen (a code-gen aid).
        StringBuilder sb = new StringBuilder();
        for (String pos : posSet)
        {
            sb.append("case \"" + pos + "\":\n");
        }
        // NOTE(review): hard-coded path; ignores modelSavePath — confirm intent.
        IOUtil.saveTxt("data/model/dependency/pos-thu.txt", sb.toString());
        return dictionaryMaker.saveTxtTo(modelSavePath);
    }

    /**
     * Records one labeled "from@to" entry plus a "from@" pseudo-entry whose
     * label accumulates the total frequency of {@code from}.
     */
    private static void addPair(String from, String to, String label, DictionaryMaker dictionaryMaker)
    {
        dictionaryMaker.add(new Word(from + "@" + to, label));
        dictionaryMaker.add(new Word(from + "@", "频次"));
    }

    /**
     * Wraps a tag in angle brackets so tags cannot collide with real words.
     * @param tag POS tag
     * @return the tag wrapped as &lt;tag&gt;
     */
    public static String wrapTag(String tag)
    {
        return "<" + tag + ">";
    }
}

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save