Add unit tests and Github actions (#1074)
* Support surrogate pairs composed of two chars (e.g. rare characters, user-created characters, emoji)

* Add test code

* Remove Test.java; it will be converted into unit tests and submitted in a separate PR

* 1. Added GitHub Actions to run the tests 2. Added unit tests
yangzhongke authored Sep 29, 2024
1 parent 8a9f2bf commit 5a04b42
Showing 4 changed files with 226 additions and 2 deletions.
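
The first commit-message item concerns supplementary characters that Java represents as a surrogate pair: two char values encoding a single code point. As a quick illustration (not part of the diff; the literal \uDB84\uDD2E is the same one used in the tests below):

// Illustration only: why a surrogate pair must be kept together as a single two-char token.
public class SurrogatePairDemo {
    public static void main(String[] args) {
        String s = "\uDB84\uDD2E";                                   // one supplementary character
        System.out.println(s.length());                              // 2 -> two UTF-16 code units (chars)
        System.out.println(s.codePointCount(0, s.length()));         // 1 -> a single code point
        System.out.println(Character.isHighSurrogate(s.charAt(0)));  // true
        System.out.println(Character.isLowSurrogate(s.charAt(1)));   // true
        // Splitting between the two chars would yield two invalid, unrenderable fragments.
    }
}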
31 changes: 31 additions & 0 deletions .github/workflows/maven.yml
@@ -0,0 +1,31 @@
# This workflow will build a Java project with Maven, and cache/restore any dependencies to improve the workflow execution time
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-java-with-maven

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Java CI with Maven

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v4
    - name: Set up JDK 17
      uses: actions/setup-java@v4
      with:
        java-version: '17'
        distribution: 'temurin'
        cache: maven
    - name: Run the Maven verify phase
      run: mvn --batch-mode --update-snapshots verify
7 changes: 5 additions & 2 deletions .gitignore
@@ -2,9 +2,12 @@
/work
/logs
/.idea
/target
/out
**/target
**/out
.DS_Store
*.iml
\.*
!.travis.yml
*.class
*.jar
*.zip
47 changes: 47 additions & 0 deletions core/src/test/java/org/wltea/analyzer/TestUtils.java
@@ -0,0 +1,47 @@
package org.wltea.analyzer;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.Path;

public class TestUtils {

    public static Configuration createFakeConfigurationSub(boolean useSmart) {
        FakeConfigurationSub configurationSub = new FakeConfigurationSub(useSmart);
        Dictionary.initial(configurationSub);
        return configurationSub;
    }

    /**
     * The ES plugin normally points at the Elasticsearch config directory. Here the current
     * project's config directory is used instead, so the tests do not depend on a locally
     * installed Elasticsearch.
     */
    static class FakeConfigurationSub extends Configuration
    {
        public FakeConfigurationSub(boolean useSmart) {
            this.useSmart = useSmart;
        }

        @Override
        public Path getConfDir() {
            return getConfigDir();
        }

        @Override
        public Path getConfigInPluginDir() {
            return getConfigDir();
        }

        @Override
        public Path getPath(String first, String... more) {
            return FileSystems.getDefault().getPath(first, more);
        }

        private static Path getConfigDir()
        {
            String projectRoot = new File(System.getProperty("user.dir")).getParentFile().getAbsolutePath();
            return new File(projectRoot, "config").toPath();
        }
    }
}
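
For orientation, the tests in the next file obtain their Configuration through this helper. A minimal standalone usage sketch (the class name and main method are illustrative, not part of the commit):

package org.wltea.analyzer;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKAnalyzer;

// Illustrative usage sketch: wire the fake configuration into the analyzer
// without a local Elasticsearch installation.
public class TestUtilsUsageSketch {
    public static void main(String[] args) throws Exception {
        // useSmart = false corresponds to ik_max_word segmentation, true to ik_smart.
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        try (IKAnalyzer analyzer = new IKAnalyzer(cfg)) {
            // The analyzer is now backed by the dictionaries under <project root>/config.
            System.out.println("IKAnalyzer ready: " + analyzer);
        }
    }
}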
143 changes: 143 additions & 0 deletions core/src/test/java/org/wltea/analyzer/lucene/IKAnalyzerTests.java
@@ -0,0 +1,143 @@
package org.wltea.analyzer.lucene;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.junit.Test;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.TestUtils;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class IKAnalyzerTests {

    /**
     * A single-char Chinese character followed by one surrogate pair
     */
    @Test
    public void tokenizeCase1_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        String[] values = tokenize(cfg, "菩\uDB84\uDD2E");
        assert values.length == 2;
        assert values[0].equals("菩");
        assert values[1].equals("\uDB84\uDD2E");
    }

    /**
     * A single-char Chinese character + one surrogate pair + a single-char Chinese character
     */
    @Test
    public void tokenizeCase2_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        String[] values = tokenize(cfg, "菩\uDB84\uDD2E凤");
        assert values.length == 3;
        assert values[0].equals("菩");
        assert values[1].equals("\uDB84\uDD2E");
        assert values[2].equals("凤");
    }

    /**
     * Single-char Chinese characters mixed with multiple surrogate pairs
     */
    @Test
    public void tokenizeCase3_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        String[] values = tokenize(cfg, "菩\uDB84\uDD2E剃\uDB84\uDC97");
        assert values.length == 4;
        assert values[0].equals("菩");
        assert values[1].equals("\uDB84\uDD2E");
        assert values[2].equals("剃");
        assert values[3].equals("\uDB84\uDC97");
    }

    /**
     * A single-char Chinese character followed by multiple consecutive surrogate pairs
     */
    @Test
    public void tokenizeCase4_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        String[] values = tokenize(cfg, "菩\uDB84\uDD2E\uDB84\uDC97");
        assert values.length == 3;
        assert values[0].equals("菩");
        assert values[1].equals("\uDB84\uDD2E");
        assert values[2].equals("\uDB84\uDC97");
    }

    /**
     * A single-char Chinese character, a surrogate pair, and a word from the dictionary
     */
    @Test
    public void tokenizeCase5_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        String[] values = tokenize(cfg, "菩\uDB84\uDD2E龟龙麟凤凤");
        assert values.length == 4;
        assert values[0].equals("菩");
        assert values[1].equals("\uDB84\uDD2E");
        assert values[2].equals("龟龙麟凤");
        assert values[3].equals("凤");
    }

    /**
     * Tokenize with the ik_max_word analyzer
     */
    @Test
    public void tokenize_max_word_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(false);
        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
        assert values.size() >= 9;
        assert values.contains("中华人民共和国");
        assert values.contains("中华人民");
        assert values.contains("中华");
        assert values.contains("华人");
        assert values.contains("人民共和国");
        assert values.contains("人民");
        assert values.contains("共和国");
        assert values.contains("共和");
        assert values.contains("国歌");
    }

    /**
     * Tokenize with the ik_smart analyzer
     */
    @Test
    public void tokenize_smart_correctly()
    {
        Configuration cfg = TestUtils.createFakeConfigurationSub(true);
        List<String> values = Arrays.asList(tokenize(cfg, "中华人民共和国国歌"));
        assert values.size() == 2;
        assert values.contains("中华人民共和国");
        assert values.contains("国歌");
    }

    static String[] tokenize(Configuration configuration, String s)
    {
        ArrayList<String> tokens = new ArrayList<>();
        try (IKAnalyzer ikAnalyzer = new IKAnalyzer(configuration)) {
            TokenStream tokenStream = ikAnalyzer.tokenStream("text", s);
            tokenStream.reset();

            while (tokenStream.incrementToken())
            {
                CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
                OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
                int len = offsetAttribute.endOffset() - offsetAttribute.startOffset();
                char[] chars = new char[len];
                System.arraycopy(charTermAttribute.buffer(), 0, chars, 0, len);
                tokens.add(new String(chars));
            }
        }
        catch (Exception ex)
        {
            throw new RuntimeException(ex);
        }
        return tokens.toArray(new String[0]);
    }
}
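
The surrogate-pair cases above boil down to iterating the input by code point rather than by char. A rough sketch of that idea (an assumption for illustration, not the plugin's actual reader or segmenter code):

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch: walk a string code point by code point so that a surrogate
// pair such as "\uDB84\uDD2E" stays together as one two-char piece.
public class CodePointWalkSketch {
    public static void main(String[] args) {
        String input = "菩\uDB84\uDD2E凤";                 // same shape as tokenizeCase2 above
        List<String> pieces = new ArrayList<>();
        int i = 0;
        while (i < input.length()) {
            int codePoint = input.codePointAt(i);
            int width = Character.charCount(codePoint);    // 1 for BMP characters, 2 for surrogate pairs
            pieces.add(input.substring(i, i + width));
            i += width;
        }
        System.out.println(pieces.size());                 // 3 -> the surrogate pair was not split
    }
}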
