From 0700dd711ba949d962b118e0191d916f967514ad Mon Sep 17 00:00:00 2001
From: Arseny Tolmachev
Date: Wed, 29 Sep 2021 17:15:41 +0900
Subject: [PATCH] add prototype of sentence splitter public API

also add split-only mode to the sudachi binary
---
 sudachi-cli/src/main.rs          | 25 +++++++--
 sudachi/src/lib.rs               |  1 +
 sudachi/src/sentence_splitter.rs | 95 ++++++++++++++++++++++++++++++++
 3 files changed, 115 insertions(+), 6 deletions(-)
 create mode 100644 sudachi/src/sentence_splitter.rs

diff --git a/sudachi-cli/src/main.rs b/sudachi-cli/src/main.rs
index 8582600d..132a94db 100644
--- a/sudachi-cli/src/main.rs
+++ b/sudachi-cli/src/main.rs
@@ -24,6 +24,7 @@ use structopt::StructOpt;
 use sudachi::config::Config;
 use sudachi::dic::dictionary::JapaneseDictionary;
 use sudachi::prelude::*;
+use sudachi::sentence_splitter::{SentenceSplitter, SplitSentences};
 use sudachi::stateless_tokeniser::StatelessTokenizer;
 
 #[cfg(feature = "bake_dictionary")]
@@ -69,10 +70,14 @@ struct Cli {
     /// If None, it refer config and then baked dictionary
     #[structopt(short = "l", long = "dict")]
     dictionary_path: Option<PathBuf>,
+
+    /// Only split sentences, do not perform analysis
+    #[structopt(long = "only-split-sentences")]
+    only_split_sentences: bool,
 }
 
 fn main() {
-    let args = Cli::from_args();
+    let args: Cli = Cli::from_args();
 
     let mode = match args.mode.as_str().parse() {
         Ok(mode) => mode,
@@ -115,14 +120,22 @@ fn main() {
         .unwrap_or_else(|e| panic!("Failed to create dictionary: {:?}", e));
     let tokenizer = StatelessTokenizer::new(&dict);
 
+    let splitter = SentenceSplitter::with_limit(32 * 1024);
+
     // tokenize and output results
     for line in reader.lines() {
         let input = line.expect("Failed to read line");
-        for morpheme_list in tokenizer
-            .tokenize_sentences(&input, mode, enable_debug)
-            .expect("Failed to tokenize input")
-        {
-            write_sentence(&mut writer, morpheme_list, print_all, wakati)
+        for (_, sentence) in splitter.split(&input) {
+            if args.only_split_sentences {
+                writeln!(&mut writer, "{}", 
sentence) + .expect("Failed to write output"); + continue; + } + + let morphemes = tokenizer.tokenize(sentence, mode, enable_debug) + .expect("Failed to tokenize input"); + + write_sentence(&mut writer, morphemes, print_all, wakati) .expect("Failed to write output"); } } diff --git a/sudachi/src/lib.rs b/sudachi/src/lib.rs index 8472e85e..a554052e 100644 --- a/sudachi/src/lib.rs +++ b/sudachi/src/lib.rs @@ -31,6 +31,7 @@ pub mod plugin; pub mod sentence_detector; pub mod stateless_tokeniser; pub mod tokenizer; +pub mod sentence_splitter; pub mod prelude { pub use crate::{ diff --git a/sudachi/src/sentence_splitter.rs b/sudachi/src/sentence_splitter.rs new file mode 100644 index 00000000..efe28efe --- /dev/null +++ b/sudachi/src/sentence_splitter.rs @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021 Works Applications Co., Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+use std::ops::Range;
+use crate::sentence_detector::SentenceDetector;
+
+pub trait SplitSentences {
+    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b>;
+}
+
+pub struct SentenceIter<'s, 'x> {
+    splitter: &'x SentenceDetector,
+    data: &'s str,
+    position: usize
+}
+
+impl<'s, 'x> Iterator for SentenceIter<'s, 'x> {
+    type Item = (Range<usize>, &'s str);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.position == self.data.len() {
+            return None
+        }
+        let slice = &self.data[self.position..];
+        let rv = self.splitter.get_eos(slice, None).unwrap();
+        let end = if rv < 0 {
+            self.data.len()
+        } else { self.position + rv as usize };
+
+        let range = self.position..end;
+        let real_slice = &self.data[range.clone()];
+        self.position = end;
+        Some((range, real_slice))
+    }
+}
+
+pub struct SentenceSplitter {
+    detector: SentenceDetector
+}
+
+impl SentenceSplitter {
+    pub fn new() -> SentenceSplitter {
+        SentenceSplitter { detector: SentenceDetector::new() }
+    }
+
+    pub fn with_limit(limit: usize) -> SentenceSplitter {
+        SentenceSplitter { detector: SentenceDetector::with_limit(limit) }
+    }
+}
+
+impl SplitSentences for SentenceSplitter {
+    fn split<'a, 'b>(&'b self, data: &'a str) -> SentenceIter<'a, 'b> {
+        SentenceIter {
+            data: data,
+            position: 0,
+            splitter: &self.detector
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn split_simple() {
+        let splitter = SentenceSplitter::new();
+        let mut iter = splitter.split("テスト。テスト");
+        assert_eq!(iter.next(), Some((0..12, "テスト。")));
+        assert_eq!(iter.next(), Some((12..21, "テスト")));
+        assert_eq!(iter.next(), None);
+    }
+
+    #[test]
+    fn split_longer_sentence() {
+        let splitter = SentenceSplitter::new();
+        let mut iter = splitter.split("　振り返って見ると白い物！　女が軒下で招いている。");
+        assert_eq!(iter.next(), Some((0..39, "\u{3000}振り返って見ると白い物！")));
+        assert_eq!(iter.next(), Some((39..75, "\u{3000}女が軒下で招いている。")));
+        assert_eq!(iter.next(), None)
+    }
+}
\ No newline at end of file