From 0700dd711ba949d962b118e0191d916f967514ad Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Wed, 29 Sep 2021 17:15:41 +0900
Subject: [PATCH] add prototype of sentence splitter public API

also add split-only mode to the sudachi binary
---
 sudachi-cli/src/main.rs          | 25 +++++++--
 sudachi/src/lib.rs               |  1 +
 sudachi/src/sentence_splitter.rs | 95 ++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 6 deletions(-)
 create mode 100644 sudachi/src/sentence_splitter.rs

diff --git a/sudachi-cli/src/main.rs b/sudachi-cli/src/main.rs
index 8582600d..132a94db 100644
--- a/sudachi-cli/src/main.rs
+++ b/sudachi-cli/src/main.rs
@@ -24,6 +24,7 @@ use structopt::StructOpt;
 use sudachi::config::Config;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::prelude::*;
+use sudachi::sentence_splitter::{SentenceSplitter, SplitSentences};
 use sudachi::stateless_tokeniser::StatelessTokenizer;
 
 #[cfg(feature = "bake_dictionary")]
@@ -69,10 +70,14 @@ struct Cli {
     /// If None, it refer config and then baked dictionary
     #[structopt(short = "l", long = "dict")]
     dictionary_path: Option<PathBuf>,
+
+    /// Only split sentences, do not perform analysis
+    #[structopt(long = "only-split-sentences")]
+    only_split_sentences: bool,
 }
 
 fn main() {
-    let args = Cli::from_args();
+    let args: Cli = Cli::from_args();
 
     let mode = match args.mode.as_str().parse() {
         Ok(mode) => mode,
@@ -115,14 +120,22 @@ fn main() {
         .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e));
     let tokenizer = StatelessTokenizer::new(&dict);
 
+    let splitter = SentenceSplitter::with_limit(32 * 1024);
+
     // tokenize and output results
     for line in reader.lines() {
         let input = line.expect("Failed to read line");
-        for morpheme_list in tokenizer
-            .tokenize_sentences(&input, mode, enable_debug)
-            .expect("Failed to tokenize input")
-        {
-            write_sentence(&mut writer, morpheme_list, print_all, wakati)
+        for (_, sentence) in splitter.split(&input) {
+            if args.only_split_sentences {
+                writeln!(&mut writer, "{}", 
sentence) + .expect("Failed to write output"); + continue; + } + + let morphemes = tokenizer.tokenize(sentence, mode, enable_debug) + .expect("Failed to tokenize input"); + + write_sentence(&mut writer, morphemes, print_all, wakati) .expect("Failed to write output"); } } diff --git a/sudachi/src/lib.rs b/sudachi/src/lib.rs index 8472e85e..a554052e 100644 --- a/sudachi/src/lib.rs +++ b/sudachi/src/lib.rs @@ -31,6 +31,7 @@ pub mod plugin; pub mod sentence_detector; pub mod stateless_tokeniser; pub mod tokenizer; +pub mod sentence_splitter; pub mod prelude { pub use crate::{ diff --git a/sudachi/src/sentence_splitter.rs b/sudachi/src/sentence_splitter.rs new file mode 100644 index 00000000..efe28efe --- /dev/null +++ b/sudachi/src/sentence_splitter.rs @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+use std::ops::Range;
+use crate::sentence_detector::SentenceDetector;
+
+pub trait SplitSentences {
+    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
+}
+
+pub struct SentenceIter<'s, 'x> {
+    splitter: &'x SentenceDetector,
+    data: &'s str,
+    position: usize
+}
+
+impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
+    type Item = (Range<usize>, &'s str);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.position == self.data.len() {
+            return None
+        }
+        let slice = &self.data[self.position..];
+        let rv = self.splitter.get_eos(slice, None).unwrap();
+        let end = if rv < 0 {
+            self.data.len()
+        } else { self.position + rv as usize };
+
+        let range = self.position..end;
+        let real_slice = &self.data[range.clone()];
+        self.position = end;
+        Some((range, real_slice))
+    }
+}
+
+pub struct SentenceSplitter {
+    detector: SentenceDetector
+}
+
+impl SentenceSplitter {
+    pub fn new() -> SentenceSplitter {
+        SentenceSplitter { detector: SentenceDetector::new() }
+    }
+
+    pub fn with_limit(limit: usize) -> SentenceSplitter {
+        SentenceSplitter { detector: SentenceDetector::with_limit(limit) }
+    }
+}
+
+impl SplitSentences for SentenceSplitter {
+    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
+        SentenceIter {
+            data: data,
+            position: 0,
+            splitter: &self.detector
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn split_simple() {
+        let splitter = SentenceSplitter::new();
+        let mut iter = splitter.split("テスト。テスト");
+        assert_eq!(iter.next(), Some((0..12, "テスト。")));
+        assert_eq!(iter.next(), Some((12..21, "テスト")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn split_longer_sentence() {
+        let splitter = SentenceSplitter::new();
+        let mut iter = splitter.split("　振り返って見ると白い物！　女が軒下で招いている。");
+        assert_eq!(iter.next(), Some((0..39, "\u{3000}振り返って見ると白い物！")));
+        assert_eq!(iter.next(), Some((39..75, "\u{3000}女が軒下で招いている。")));
+        assert_eq!(iter.next(), None)
+    }
+}
\ No newline at end of file