适配Elasticsearch 6.5.3，优化部分代码

KennFalcon · Dec 25, 2018 · b7c5be3
1 parent 46d697e
commit b7c5be3
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 22 deletions.
diff --git a/README.md b/README.md
@@ -8,18 +8,17 @@ HanLP Analyzer for ElasticSearch
 
 此分词器基于[HanLP](http://www.hankcs.com/nlp)，提供了HanLP中大部分的分词方式。
 
-🚩适配Elasticsearch 6.5.2，增加了远程词典的功能，功能类似于medcl大神的[IK分词器插件](https://github.com/medcl/elasticsearch-analysis-ik),因为hanlp有词性的配置，所以远程自定义词典配置稍有不同，需要配置词性和频次。
+🚩适配Elasticsearch 6.5.3，增加了远程词典的功能，功能类似于medcl大神的[IK分词器插件](https://github.com/medcl/elasticsearch-analysis-ik),因为hanlp有词性的配置，所以远程自定义词典配置稍有不同，需要配置词性和频次。
 
 ----------
 
 版本对应
 ----------
 
-### 1. 下载安装ES对应Plugin Release版本
-
 | Plugin version | Elastic version |
 | :------------- | :-------------- |
 | master         | 6.x             |
+| 6.5.3          | 6.5.3           |
 | 6.5.2          | 6.5.2           |
 | 6.5.1          | 6.5.1           |
 | 6.5.0          | 6.5.0           |
@@ -33,6 +32,11 @@ HanLP Analyzer for ElasticSearch
 | 6.2.2          | 6.2.2           |
 | 5.2.2          | 5.2.2           |
 
+安装步骤
+----------
+
+### 1. 下载安装ES对应Plugin Release版本
+
 安装方式：
 
 方式一
@@ -49,7 +53,7 @@ HanLP Analyzer for ElasticSearch
 
    a. 使用elasticsearch插件脚本安装command如下：
 
-   `./bin/elasticsearch-plugin install https://github.com/KennFalcon/elasticsearch-analysis-hanlp/releases/download/v6.5.2/elasticsearch-analysis-hanlp-6.5.2.zip`
+   `./bin/elasticsearch-plugin install https://github.com/KennFalcon/elasticsearch-analysis-hanlp/releases/download/v6.5.3/elasticsearch-analysis-hanlp-6.5.3.zip`
 
 ### 2. 安装数据包
 
@@ -157,6 +161,8 @@ POST http://localhost:9200/twitter2/_analyze
 远程词典配置
 ----------
 
+配置文件为*ES_HOME*/config/analysis-hanlp/hanlp-remote.xml
+
 ```xml
 <properties>
     <comment>HanLP Analyzer 扩展配置</comment>
@@ -169,7 +175,7 @@ POST http://localhost:9200/twitter2/_analyze
 </properties>
 ```
 
-### 远程扩展字典
+### 1. 远程扩展字典
 
 其中words_location为URL或者URL+" "+词性，如：
 
@@ -181,7 +187,7 @@ POST http://localhost:9200/twitter2/_analyze
 
 第二个样例，配置词典URL，同时配置该词典的默认词性nt，当然词典内部同样遵循[单词] [词性A] [A的频次] [词性B] [B的频次] ... 如果不配置词性，则采用默认词性nt。
 
-### 远程扩展停止词字典
+### 2. 远程扩展停止词字典
 
 其中stop_words_location为URL，如：
 

diff --git a/pom.xml b/pom.xml
@@ -10,7 +10,7 @@
     <description>HanLP Analyzer for ElasticSearch</description>
 
     <properties>
-        <elasticsearch.version>6.5.2</elasticsearch.version>
+        <elasticsearch.version>6.5.3</elasticsearch.version>
         <maven.compiler.target>1.8</maven.compiler.target>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <elasticsearch.assembly.descriptor>${project.basedir}/src/main/assemblies/plugin.xml</elasticsearch.assembly.descriptor>

diff --git a/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java b/src/main/java/com/hankcs/lucene/BaseHanLPAnalyzer.java
@@ -26,7 +26,7 @@ abstract class BaseHanLPAnalyzer extends Analyzer {
      * @param configuration 配置信息
      * @return 新segment
      */
-    Segment buildSegment(Segment segment, Configuration configuration) {
+    protected Segment buildSegment(Segment segment, Configuration configuration) {
         segment.enableIndexMode(configuration.isEnableIndexMode())
                 .enableNumberQuantifierRecognize(configuration.isEnableNumberQuantifierRecognize())
                 .enableCustomDictionary(configuration.isEnableCustomDictionary())
@@ -38,13 +38,11 @@ Segment buildSegment(Segment segment, Configuration configuration) {
                 .enablePartOfSpeechTagging(configuration.isEnablePartOfSpeechTagging());
         if (configuration.isEnableTraditionalChineseMode()) {
             segment.enableIndexMode(false);
-            Segment inner = segment;
-            TraditionalChineseTokenizer.SEGMENT = inner;
-            segment = new Segment() {
+            TraditionalChineseTokenizer.SEGMENT = segment;
+            return new Segment() {
                 @Override
                 protected List<Term> segSentence(char[] sentence) {
-                    List<Term> termList = TraditionalChineseTokenizer.segment(new String(sentence));
-                    return termList;
+                    return TraditionalChineseTokenizer.segment(new String(sentence));
                 }
             };
         }
@@ -57,7 +55,7 @@ protected List<Term> segSentence(char[] sentence) {
      * @param configuration 配置信息
      * @return Tokenizer
      */
-    Tokenizer buildBaseTokenizer(Segment segment, Configuration configuration) {
+    protected Tokenizer buildBaseTokenizer(Segment segment, Configuration configuration) {
         return AccessController.doPrivileged((PrivilegedAction<HanLPTokenizer>) () -> new HanLPTokenizer(segment, configuration));
     }
 }
diff --git a/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPDijkstraAnalyzer.java
@@ -4,10 +4,6 @@
 import com.hankcs.hanlp.seg.Dijkstra.DijkstraSegment;
 import com.hankcs.hanlp.seg.Segment;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Tokenizer;
-
-import java.security.AccessController;
-import java.security.PrivilegedAction;
 
 /**
  * @project: elasticsearch-analysis-hanlp

diff --git a/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java b/src/main/java/com/hankcs/lucene/HanLPNLPAnalyzer.java
@@ -4,10 +4,6 @@
 import com.hankcs.hanlp.HanLP;
 import com.hankcs.hanlp.seg.Segment;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Tokenizer;
-
-import java.security.AccessController;
-import java.security.PrivilegedAction;
 
 /**
  * @project: elasticsearch-analysis-hanlp