-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathremove_long_words_mapper.py
55 lines (46 loc) · 1.94 KB
/
remove_long_words_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# Some code here has been modified from:
# https://huggingface.co/spaces/huggingface/text-data-filtering
# --------------------------------------------------------
import sys
from ..base_op import OPERATORS, Mapper
from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
split_on_newline_tab_whitespace, strip)
@OPERATORS.register_module('remove_long_words_mapper')
class RemoveLongWordsMapper(Mapper):
    """Mapper that removes words whose length is outside a given range.

    A word survives when its length lies in ``[min_len, max_len]``, either
    as-is or after being passed through ``strip(word, SPECIAL_CHARACTERS)``.
    """

    _batched_op = True

    def __init__(self,
                 min_len: int = 1,
                 max_len: int = sys.maxsize,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param min_len: Lower bound (inclusive) on the word length; words
            shorter than this are removed.
        :param max_len: Upper bound (inclusive) on the word length; words
            longer than this are removed.
        :param args: extra positional args forwarded to the base class
        :param kwargs: extra keyword args forwarded to the base class
        """
        super().__init__(*args, **kwargs)
        self.min_len = min_len
        self.max_len = max_len

    def should_keep_long_word(self, word):
        """
        Decide whether a single word is kept.

        :param word: the word to check
        :return: True if the length of the raw word, or of the word after
            ``strip(word, SPECIAL_CHARACTERS)``, is within
            ``[min_len, max_len]``; False otherwise.
        """
        lower, upper = self.min_len, self.max_len

        # Fast path: the untouched word already fits the range.
        if lower <= len(word) <= upper:
            return True

        # Second chance: measure the word again after stripping it with
        # SPECIAL_CHARACTERS (only evaluated when the raw check failed).
        stripped_word = strip(word, SPECIAL_CHARACTERS)
        return lower <= len(stripped_word) <= upper

    def process_batched(self, samples):
        """
        Filter the words of every text in the batch, in place.

        :param samples: batch container; the texts are read from and
            written back to ``samples[self.text_key]`` index by index.
        :return: the same ``samples`` object with each text rebuilt from
            the words that passed ``should_keep_long_word``.
        """
        keep_word = self.should_keep_long_word

        for idx, text in enumerate(samples[self.text_key]):
            # The splitter yields three nesting levels:
            # sentences -> subsentences -> words.
            kept_sentences = []
            for sentence in split_on_newline_tab_whitespace(text):
                kept_sentences.append([
                    [word for word in subsentence if keep_word(word)]
                    for subsentence in sentence
                ])

            # Re-assemble the nested structure into a single string and
            # overwrite the original entry.
            samples[self.text_key][idx] = merge_on_whitespace_tab_newline(
                kept_sentences)

        return samples