add prototype of sentence splitter public API #51

Merged · 1 commit · Sep 30, 2021
25 changes: 19 additions & 6 deletions sudachi-cli/src/main.rs
@@ -24,6 +24,7 @@ use structopt::StructOpt;
 use sudachi::config::Config;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::prelude::*;
+use sudachi::sentence_splitter::{SentenceSplitter, SplitSentences};
 use sudachi::stateless_tokeniser::StatelessTokenizer;
 
 #[cfg(feature = "bake_dictionary")]
@@ -69,10 +70,14 @@ struct Cli {
     /// If None, it refers to the config and then the baked dictionary
     #[structopt(short = "l", long = "dict")]
     dictionary_path: Option<PathBuf>,
+
+    /// Only split sentences, do not perform analysis
+    #[structopt(long = "only-split-sentences")]
+    only_split_sentences: bool,
 }
 
 fn main() {
-    let args = Cli::from_args();
+    let args: Cli = Cli::from_args();
 
     let mode = match args.mode.as_str().parse() {
         Ok(mode) => mode,
@@ -115,14 +120,22 @@ fn main() {
         .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e));
     let tokenizer = StatelessTokenizer::new(&dict);
 
+    let splitter = SentenceSplitter::with_limit(32 * 1024);
+
     // tokenize and output results
     for line in reader.lines() {
         let input = line.expect("Failed to read line");
-        for morpheme_list in tokenizer
-            .tokenize_sentences(&input, mode, enable_debug)
-            .expect("Failed to tokenize input")
-        {
-            write_sentence(&mut writer, morpheme_list, print_all, wakati)
+        for (_, sentence) in splitter.split(&input) {
+            if args.only_split_sentences {
+                writeln!(&mut writer, "{}", sentence)
+                    .expect("Failed to write output");
+                continue;
+            }
+
+            let morphemes = tokenizer.tokenize(sentence, mode, enable_debug)
+                .expect("Failed to tokenize input");
+
+            write_sentence(&mut writer, morphemes, print_all, wakati)
                 .expect("Failed to write output");
         }
     }
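Usage note (not part of the diff): with this change, the new flag makes the CLI print each detected sentence on its own line and skip morphological analysis entirely, e.g. `echo "テスト。テスト" | sudachi --only-split-sentences`, assuming the built binary is invoked as `sudachi` and reads from standard input.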
1 change: 1 addition & 0 deletions sudachi/src/lib.rs
@@ -31,6 +31,7 @@ pub mod plugin;
 pub mod sentence_detector;
 pub mod stateless_tokeniser;
 pub mod tokenizer;
+pub mod sentence_splitter;
 
 pub mod prelude {
     pub use crate::{
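Exporting the module from lib.rs makes `sudachi::sentence_splitter` part of the crate's public API, alongside the existing `sentence_detector` it builds on.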
95 changes: 95 additions & 0 deletions sudachi/src/sentence_splitter.rs
@@ -0,0 +1,95 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

use std::ops::Range;
use crate::sentence_detector::SentenceDetector;

/// Splits input text into sentences.
pub trait SplitSentences {
    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
}

pub struct SentenceIter<'s, 'x> {
    splitter: &'x SentenceDetector,
    data: &'s str,
    position: usize,
}

impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
    type Item = (Range<usize>, &'s str);

    fn next(&mut self) -> Option<Self::Item> {
        if self.position == self.data.len() {
            return None;
        }
        let slice = &self.data[self.position..];
        let rv = self.splitter.get_eos(slice, None).unwrap();
        // A negative value from get_eos means no explicit sentence boundary
        // was found, so the rest of the input is treated as one sentence.
        let end = if rv < 0 {
            self.data.len()
        } else {
            self.position + rv as usize
        };

        let range = self.position..end;
        let real_slice = &self.data[range.clone()];
        self.position = end;
        Some((range, real_slice))
    }
}

pub struct SentenceSplitter {
    detector: SentenceDetector,
}

impl SentenceSplitter {
    pub fn new() -> SentenceSplitter {
        SentenceSplitter { detector: SentenceDetector::new() }
    }

    pub fn with_limit(limit: usize) -> SentenceSplitter {
        SentenceSplitter { detector: SentenceDetector::with_limit(limit) }
    }
}

impl SplitSentences for SentenceSplitter {
    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
        SentenceIter {
            data,
            position: 0,
            splitter: &self.detector,
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn split_simple() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter.split("テスト。テスト");
        assert_eq!(iter.next(), Some((0..12, "テスト。")));
        assert_eq!(iter.next(), Some((12..21, "テスト")));
        assert_eq!(iter.next(), None);
    }

    #[test]
    fn split_longer_sentence() {
        let splitter = SentenceSplitter::new();
        let mut iter = splitter.split("　振り返って見ると白い物！　女が軒下で招いている。");
        assert_eq!(iter.next(), Some((0..39, "\u{3000}振り返って見ると白い物！")));
        assert_eq!(iter.next(), Some((39..75, "\u{3000}女が軒下で招いている。")));
        assert_eq!(iter.next(), None);
    }
}
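
For reference, a minimal sketch of how a downstream crate might use the new public API, based only on the types and tests shown in this diff (the dependency is assumed to be named `sudachi`; error handling is elided):

use sudachi::sentence_splitter::{SentenceSplitter, SplitSentences};

fn main() {
    // Use the default sentence-length limit; `with_limit` would bound it instead.
    let splitter = SentenceSplitter::new();

    // The iterator yields (byte range, sentence slice) pairs over the input.
    for (range, sentence) in splitter.split("テスト。テスト") {
        // Prints "0..12  テスト。" then "12..21  テスト" (UTF-8 byte offsets,
        // matching the ranges asserted in split_simple above).
        println!("{:?}\t{}", range, sentence);
    }
}

Note that the `SplitSentences` trait must be in scope for the `split` call to resolve, which is why the CLI change above imports both names.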