Add unit tests and GitHub Actions (#1074)

* 支持由两个char组成的Surrogate Pair（比如生僻字、自造字、emoji等） * 增加测试代码 * 删除Test.java，准备转换为单元测试，然后放到单独的PR中提交 * 1. added GitHub Actions to run tests 2. added unit tests
infinilabs · Sep 29, 2024 · 5a04b42 · 5a04b42
1 parent 8a9f2bf
commit 5a04b42
Show file tree

Hide file tree

Showing 4 changed files with 226 additions and 2 deletions.
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
@@ -0,0 +1,31 @@
+# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-maven
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Java CI with Maven
+
+on:  # triggers: pushes to master, and pull requests whose base branch is master
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+    # Single job: check out the sources, install a JDK, then run the Maven build.
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up JDK 17
+      uses: actions/setup-java@v4
+      with:
+        java-version: '17'
+        distribution: 'temurin'
+        cache: maven  # reuse downloaded Maven dependencies across runs
+    - name: Run the Maven verify phase
+      run: mvn --batch-mode --update-snapshots verify
diff --git a/.gitignore b/.gitignore
@@ -2,9 +2,12 @@
 /work
 /logs
 /.idea
-/target
-/out
+**/target
+**/out
 .DS_Store
 *.iml
 \.*
 !.travis.yml
+*.class
+*.jar
+*.zip
diff --git a/core/src/test/java/org/wltea/analyzer/TestUtils.java b/core/src/test/java/org/wltea/analyzer/TestUtils.java
@@ -0,0 +1,47 @@
+package org.wltea.analyzer;
+
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.dic.Dictionary;
+import java.io.File;
+import java.nio.file.FileSystems;
+import java.nio.file.Path;
+
+/** Test helpers that build a {@link Configuration} backed by this project's own config directory. */
+public class TestUtils {
+    /** Creates a {@link FakeConfigurationSub}, passes it to {@code Dictionary.initial}, and returns it. */
+    public static Configuration createFakeConfigurationSub(boolean useSmart) {
+        FakeConfigurationSub fake = new FakeConfigurationSub(useSmart);
+        Dictionary.initial(fake);
+        return fake;
+    }
+    /**
+     * The ES plugin needs to point at the ES config directory; this stub uses the current project's
+     * {@code config} directory instead, so the tests do not depend on ES being installed locally.
+     */
+    static class FakeConfigurationSub extends Configuration {
+        public FakeConfigurationSub(boolean useSmart) {
+            this.useSmart = useSmart;
+        }
+
+        @Override
+        public Path getConfDir() {
+            return getConfigDir();
+        }
+
+        @Override
+        public Path getConfigInPluginDir() {
+            return getConfigDir();
+        }
+
+        @Override
+        public Path getPath(String first, String... more) {
+            return FileSystems.getDefault().getPath(first, more);
+        }
+        /** Returns the {@code config} directory inside the parent of {@code user.dir} (the working directory). */
+        private static Path getConfigDir() {
+            File workingDir = new File(System.getProperty("user.dir"));
+            File projectRoot = workingDir.getParentFile().getAbsoluteFile();
+            return new File(projectRoot, "config").toPath();
+        }
+    }
+}
diff --git a/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java b/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
@@ -0,0 +1,143 @@
+package org.wltea.analyzer.lucene;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.junit.Test;
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.TestUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class IKAnalyzerTests {
+
+    /**
+     * 单char汉字+一个Surrogate Pair
+     */
+    @Test
+    public void tokenizeCase1_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E");
+        assert values.length == 2;
+        assert values[0].equals("菩");
+        assert values[1].equals("\uDB84\uDD2E");
+    }
+
+    /**
+     * 单char汉字+一个Surrogate Pair+单char汉字
+     */
+    @Test
+    public void tokenizeCase2_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E凤");
+        assert values.length == 3;
+        assert values[0].equals("菩");
+        assert values[1].equals("\uDB84\uDD2E");
+        assert values[2].equals("凤");
+    }
+
+    /**
+     * 单char汉字和多Surrogate Pair混合
+     */
+    @Test
+    public void tokenizeCase3_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E剃\uDB84\uDC97");
+        assert values.length == 4;
+        assert values[0].equals("菩");
+        assert values[1].equals("\uDB84\uDD2E");
+        assert values[2].equals("剃");
+        assert values[3].equals("\uDB84\uDC97");
+    }
+
+    /**
+     * 单char汉字和多个连续Surrogate Pair混合
+     */
+    @Test
+    public void tokenizeCase4_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E\uDB84\uDC97");
+        assert values.length == 3;
+        assert values[0].equals("菩");
+        assert values[1].equals("\uDB84\uDD2E");
+        assert values[2].equals("\uDB84\uDC97");
+    }
+
+    /**
+     * 单char汉字和多个连续Surrogate Pair加词库中的词
+     */
+    @Test
+    public void tokenizeCase5_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E龟龙麟凤凤");
+        assert values.length == 4;
+        assert values[0].equals("菩");
+        assert values[1].equals("\uDB84\uDD2E");
+        assert values[2].equals("龟龙麟凤");
+        assert values[3].equals("凤");
+    }
+
+    /**
+     * 用ik_max_word分词器分词
+     */
+    @Test
+    public void tokenize_max_word_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
+        assert values.size() >= 9;
+        assert values.contains("中华人民共和国");
+        assert values.contains("中华人民");
+        assert values.contains("中华");
+        assert values.contains("华人");
+        assert values.contains("人民共和国");
+        assert values.contains("人民");
+        assert values.contains("共和国");
+        assert values.contains("共和");
+        assert values.contains("国歌");
+    }
+
+    /**
+     * 用ik_smart分词器分词
+     */
+    @Test
+    public void tokenize_smart_correctly()
+    {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(true);
+        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
+        assert values.size() == 2;
+        assert values.contains("中华人民共和国");
+        assert values.contains("国歌");
+    }
+
+    static String[] tokenize(Configuration configuration, String s)
+    {
+        ArrayList<String> tokens = new ArrayList<>();
+        try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration)) {
+            TokenStream tokenStream = ikAnalyzer.tokenStream("text", s);
+            tokenStream.reset();
+
+            while(tokenStream.incrementToken())
+            {
+                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+                OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
+                int len = offsetAttribute.endOffset()-offsetAttribute.startOffset();
+                char[] chars = new char[len];
+                System.arraycopy(charTermAttribute.buffer(), 0, chars, 0, len);
+                tokens.add(new String(chars));
+            }
+        }
+        catch (Exception ex)
+        {
+            throw new RuntimeException(ex);
+        }
+        return  tokens.toArray(new String[0]);
+    }
+}