diff --git a/README.md b/README.md index c2425c3..3ef2435 100644 --- a/README.md +++ b/README.md @@ -8,18 +8,17 @@ HanLP Analyzer for ElasticSearch 此分词器基于[HanLP](http://www.hankcs.com/nlp),提供了HanLP中大部分的分词方式。 -🚩适配Elasticsearch 6.5.2,增加了远程词典的功能,功能类似于medcl大神的[IK分词器插件](https://github.com/medcl/elasticsearch-analysis-ik),因为hanlp有词性的配置,所以远程自定义词典配置稍有不同,需要配置词性和频次。 +🚩适配Elasticsearch 6.5.3,增加了远程词典的功能,功能类似于medcl大神的[IK分词器插件](https://github.com/medcl/elasticsearch-analysis-ik),因为hanlp有词性的配置,所以远程自定义词典配置稍有不同,需要配置词性和频次。 ---------- 版本对应 ---------- -### 1. 下载安装ES对应Plugin Release版本 - | Plugin version | Elastic version | | :------------- | :-------------- | | master | 6.x | +| 6.5.3 | 6.5.3 | | 6.5.2 | 6.5.2 | | 6.5.1 | 6.5.1 | | 6.5.0 | 6.5.0 | @@ -33,6 +32,11 @@ HanLP Analyzer for ElasticSearch | 6.2.2 | 6.2.2 | | 5.2.2 | 5.2.2 | +安装步骤 +---------- + +### 1. 下载安装ES对应Plugin Release版本 + 安装方式: 方式一 @@ -49,7 +53,7 @@ HanLP Analyzer for ElasticSearch a. 使用elasticsearch插件脚本安装command如下: - `./bin/elasticsearch-plugin install https://github.com/KennFalcon/elasticsearch-analysis-hanlp/releases/download/v6.5.2/elasticsearch-analysis-hanlp-6.5.2.zip` + `./bin/elasticsearch-plugin install https://github.com/KennFalcon/elasticsearch-analysis-hanlp/releases/download/v6.5.3/elasticsearch-analysis-hanlp-6.5.3.zip` ### 2. 安装数据包 @@ -157,6 +161,8 @@ POST http://localhost:9200/twitter2/_analyze 远程词典配置 ---------- +配置文件为*ES_HOME*/config/analysis-hanlp/hanlp-remote.xml + ```xml HanLP Analyzer 扩展配置 @@ -169,7 +175,7 @@ POST http://localhost:9200/twitter2/_analyze ``` -### 远程扩展字典 +### 1. 远程扩展字典 其中words_location为URL或者URL+" "+词性,如: @@ -181,7 +187,7 @@ POST http://localhost:9200/twitter2/_analyze 第二个样例,配置词典URL,同时配置该词典的默认词性nt,当然词典内部同样遵循[单词] [词性A] [A的频次] [词性B] [B的频次] ... 如果不配置词性,则采用默认词性nt。 -### 远程扩展停止词字典 +### 2. 远程扩展停止词字典 其中stop_words_location为URL,如: diff --git a/pom.xml b/pom.xml index d917b39..e7f93d9 100644 --- a/pom.xml +++ b/pom.xml @@ -10,7 +10,7 @@ HanLP Analyzer for ElasticSearch - 6.5.2 + 6.5.3 1.8 UTF-8 ${project.basedir}/src/main/assemblies/plugin.xml diff --git a/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java b/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java index b8749df..f8d71bf 100644 --- a/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java +++ b/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java @@ -26,7 +26,7 @@ abstract class BaseHanLPAnalyzer extends Analyzer { * @param configuration 配置信息 * @return 新segment */ - Segment buildSegment(Segment segment, Configuration configuration) { + protected Segment buildSegment(Segment segment, Configuration configuration) { segment.enableIndexMode(configuration.isEnableIndexMode()) .enableNumberQuantifierRecognize(configuration.isEnableNumberQuantifierRecognize()) .enableCustomDictionary(configuration.isEnableCustomDictionary()) @@ -38,13 +38,11 @@ Segment buildSegment(Segment segment, Configuration configuration) { .enablePartOfSpeechTagging(configuration.isEnablePartOfSpeechTagging()); if (configuration.isEnableTraditionalChineseMode()) { segment.enableIndexMode(false); - Segment inner = segment; - TraditionalChineseTokenizer.SEGMENT = inner; - segment = new Segment() { + TraditionalChineseTokenizer.SEGMENT = segment; + return new Segment() { @Override protected List segSentence(char[] sentence) { - List termList = TraditionalChineseTokenizer.segment(new String(sentence)); - return termList; + return TraditionalChineseTokenizer.segment(new String(sentence)); } }; } @@ -57,7 +55,7 @@ protected List segSentence(char[] sentence) { * @param configuration 配置信息 * @return Tokenizer */ - Tokenizer buildBaseTokenizer(Segment segment, Configuration configuration) { + protected Tokenizer buildBaseTokenizer(Segment segment, Configuration configuration) { return AccessController.doPrivileged((PrivilegedAction) () -> new HanLPTokenizer(segment, configuration)); } } diff --git a/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java index 34ad03a..72535b4 100644 --- a/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java +++ b/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java @@ -4,10 +4,6 @@ import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment; import com.hankcs.hanlp.seg.Segment; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Tokenizer; - -import java.security.AccessController; -import java.security.PrivilegedAction; /** * @project: elasticsearch-analysis-hanlp diff --git a/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java index be48637..77045ca 100644 --- a/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java +++ b/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java @@ -4,10 +4,6 @@ import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.seg.Segment; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.Tokenizer; - -import java.security.AccessController; -import java.security.PrivilegedAction; /** * @project: elasticsearch-analysis-hanlp