diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
new file mode 100644
index 00000000..ee6a60f0
--- /dev/null
+++ b/.github/workflows/maven.yml
@@ -0,0 +1,31 @@
+# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time
+# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-maven
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Java CI with Maven
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up JDK 17
+      uses: actions/setup-java@v4
+      with:
+        java-version: '17'
+        distribution: 'temurin'
+        cache: maven
+    - name: Run the Maven verify phase
+      run: mvn --batch-mode --update-snapshots verify
diff --git a/.gitignore b/.gitignore
index a53ac3d1..d3c68d00 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,13 @@
 /work
 /logs
 /.idea
-/target
-/out
+**/target
+**/out
 .DS_Store
 *.iml
 \.*
 !.travis.yml
+!.github
+*.class
+*.jar
+*.zip
diff --git a/core/src/test/java/org/wltea/analyzer/TestUtils.java b/core/src/test/java/org/wltea/analyzer/TestUtils.java
new file mode 100644
index 00000000..49c1b809
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/TestUtils.java
@@ -0,0 +1,69 @@
+package org.wltea.analyzer;
+
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.dic.Dictionary;
+import java.io.File;
+import java.nio.file.FileSystems;
+import java.nio.file.Path;
+
+/**
+ * Shared helpers for the analyzer tests.
+ */
+public class TestUtils {
+
+    /**
+     * Creates a test configuration and initializes the dictionary with it.
+     *
+     * @param useSmart value stored in the configuration's {@code useSmart} field
+     * @return the configuration that was passed to {@link Dictionary#initial}
+     */
+    public static Configuration createFakeConfigurationSub(boolean useSmart) {
+        FakeConfigurationSub configurationSub = new FakeConfigurationSub(useSmart);
+        Dictionary.initial(configurationSub);
+        return configurationSub;
+    }
+
+    /**
+     * The ES plugin needs to point at the ES config directory; this uses the current
+     * project's config directory instead, so the tests do not depend on ES being
+     * installed on the machine.
+     */
+    static class FakeConfigurationSub extends Configuration {
+
+        public FakeConfigurationSub(boolean useSmart) {
+            this.useSmart = useSmart;
+        }
+
+        @Override
+        public Path getConfDir() {
+            return getConfigDir();
+        }
+
+        @Override
+        public Path getConfigInPluginDir() {
+            return getConfigDir();
+        }
+
+        @Override
+        public Path getPath(String first, String... more) {
+            return FileSystems.getDefault().getPath(first, more);
+        }
+
+        /**
+         * Resolves {@code config} under the parent of the working directory.
+         * NOTE(review): assumes tests run from a module directory directly beneath the
+         * project root -- confirm for IDE runs.
+         *
+         * @throws IllegalStateException if the working directory has no parent
+         */
+        private static Path getConfigDir() {
+            File workingDir = new File(System.getProperty("user.dir")).getAbsoluteFile();
+            File projectRoot = workingDir.getParentFile();
+            if (projectRoot == null) {
+                throw new IllegalStateException(
+                        "Working directory has no parent, cannot locate config: " + workingDir);
+            }
+            return new File(projectRoot, "config").toPath();
+        }
+    }
+}
diff --git a/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java b/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
new file mode 100644
index 00000000..36cd76fb
--- /dev/null
+++ b/core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
@@ -0,0 +1,128 @@
+package org.wltea.analyzer.lucene;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.junit.Test;
+import org.wltea.analyzer.cfg.Configuration;
+import org.wltea.analyzer.TestUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+/**
+ * Tokenization tests for {@link IKAnalyzer}: surrogate-pair input and the two segmentation modes.
+ */
+public class IKAnalyzerTests {
+
+    /**
+     * A single-char Han character followed by one surrogate pair.
+     */
+    @Test
+    public void tokenizeCase1_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E");
+        assertArrayEquals(new String[] {"菩", "\uDB84\uDD2E"}, values);
+    }
+
+    /**
+     * A single-char Han character, one surrogate pair, then another single-char Han character.
+     */
+    @Test
+    public void tokenizeCase2_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E凤");
+        assertArrayEquals(new String[] {"菩", "\uDB84\uDD2E", "凤"}, values);
+    }
+
+    /**
+     * Single-char Han characters interleaved with several surrogate pairs.
+     */
+    @Test
+    public void tokenizeCase3_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E剃\uDB84\uDC97");
+        assertArrayEquals(new String[] {"菩", "\uDB84\uDD2E", "剃", "\uDB84\uDC97"}, values);
+    }
+
+    /**
+     * A single-char Han character followed by several consecutive surrogate pairs.
+     */
+    @Test
+    public void tokenizeCase4_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E\uDB84\uDC97");
+        assertArrayEquals(new String[] {"菩", "\uDB84\uDD2E", "\uDB84\uDC97"}, values);
+    }
+
+    /**
+     * A single-char Han character and a surrogate pair followed by a word from the dictionary.
+     */
+    @Test
+    public void tokenizeCase5_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        String[] values = tokenize(cfg, "菩\uDB84\uDD2E龟龙麟凤凤");
+        assertArrayEquals(new String[] {"菩", "\uDB84\uDD2E", "龟龙麟凤", "凤"}, values);
+    }
+
+    /**
+     * Tokenizes with the ik_max_word analyzer.
+     */
+    @Test
+    public void tokenize_max_word_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
+        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
+        assertTrue(values.size() >= 9);
+        assertTrue(values.contains("中华人民共和国"));
+        assertTrue(values.contains("中华人民"));
+        assertTrue(values.contains("中华"));
+        assertTrue(values.contains("华人"));
+        assertTrue(values.contains("人民共和国"));
+        assertTrue(values.contains("人民"));
+        assertTrue(values.contains("共和国"));
+        assertTrue(values.contains("共和"));
+        assertTrue(values.contains("国歌"));
+    }
+
+    /**
+     * Tokenizes with the ik_smart analyzer.
+     */
+    @Test
+    public void tokenize_smart_correctly() {
+        Configuration cfg = TestUtils.createFakeConfigurationSub(true);
+        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
+        assertEquals(2, values.size());
+        assertTrue(values.contains("中华人民共和国"));
+        assertTrue(values.contains("国歌"));
+    }
+
+    /**
+     * Tokenizes {@code s} with a fresh {@link IKAnalyzer} and returns the term texts in order.
+     *
+     * @param configuration configuration handed to the analyzer
+     * @param s text to tokenize
+     * @return the text of every emitted token, in emission order
+     * @throws RuntimeException wrapping any exception raised while analyzing
+     */
+    static String[] tokenize(Configuration configuration, String s) {
+        List<String> tokens = new ArrayList<>();
+        try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration);
+             TokenStream tokenStream = ikAnalyzer.tokenStream("text", s)) {
+            CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+            tokenStream.reset();
+            while (tokenStream.incrementToken()) {
+                // Read the term through its own length rather than the offset span: the two
+                // can differ, and the term buffer may be longer than the current term.
+                tokens.add(termAttribute.toString());
+            }
+            tokenStream.end();
+        } catch (Exception ex) {
+            throw new RuntimeException(ex);
+        }
+        return tokens.toArray(new String[0]);
+    }
+}