-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathsentence_split_mapper.py
37 lines (26 loc) · 1.02 KB
/
sentence_split_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from data_juicer.utils.model_utils import get_model, prepare_model
from ..base_op import OPERATORS, Mapper
from ..common import get_sentences_from_document
# Name under which the operator class below is registered in OPERATORS.
OP_NAME = 'sentence_split_mapper'
@OPERATORS.register_module(OP_NAME)
class SentenceSplitMapper(Mapper):
    """Batched mapper that runs sentence splitting over each text sample."""

    # Marks this operator as working on batches of samples
    # (see ``process_batched``).
    _batched_op = True

    def __init__(self, lang: str = 'en', *args, **kwargs):
        """
        Set up the mapper and prepare the nltk model for ``lang``.

        :param lang: language of the text whose sentences are split.
        :param args: extra positional args forwarded to the base class.
        :param kwargs: extra keyword args forwarded to the base class.
        """
        super().__init__(*args, **kwargs)
        self.lang = lang
        # Only the key returned by prepare_model is stored; the model itself
        # is fetched with get_model at processing time.
        self.model_key = prepare_model(model_type='nltk', lang=lang)

    def process_batched(self, samples):
        """
        Replace every text in the batch with its sentence-split result.

        :param samples: batch mapping; the entry under ``self.text_key`` is
            iterated as a collection of texts and overwritten with the
            processed results.
            NOTE(review): ``text_key`` is presumably set by the ``Mapper``
            base class -- confirm.
        :return: the same ``samples`` object, updated in place.
        """
        tokenizer = get_model(self.model_key)

        split_texts = []
        for document in samples[self.text_key]:
            # Fall back to no tokenizer function when the model is missing.
            if tokenizer:
                split_fn = tokenizer.tokenize
            else:
                split_fn = None
            split_texts.append(
                get_sentences_from_document(document, model_func=split_fn))

        samples[self.text_key] = split_texts
        return samples