📝 docs: Update README and docstrings with multiline text handling
- Added note in README about handling multiline text for language detection
- Updated example outputs in README for clarity
- Added docstrings to infer.py functions for better documentation
- Added test for exception handling in detect function in test_detect.py
sudoskys committed Sep 14, 2024
1 parent 0b4d872 commit 37aca03
Showing 4 changed files with 57 additions and 7 deletions.
25 changes: 20 additions & 5 deletions README.md
@@ -45,22 +45,37 @@ model.
### Native API (Recommended)

> [!NOTE]
> This function assumes it is given a single line of text. *You should remove `\n` characters before passing the text.*
> If the sample is too long or too short, accuracy will decrease (for example, a very short Chinese sample
> may be detected as Japanese).
```python
from fast_langdetect import detect, detect_multilingual

# Single language detection
print(detect("Hello, world!"))
# Output: {'lang': 'en', 'score': 0.1520957201719284}
# Output: {'lang': 'en', 'score': 0.12450417876243591}

multiline_text = """
Hello, world!
This is a multiline text.
But we need to remove `\n` characters, or it will raise a ValueError.
"""
multiline_text = multiline_text.replace("\n", "")
print(detect(multiline_text))
# Output: {'lang': 'en', 'score': 0.8509423136711121}

print(detect("Привет, мир!")["lang"])
# Output: ru

# Multi-language detection
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
# Output: [
# {'lang': 'ru', 'score': 0.39008623361587524},
# {'lang': 'zh', 'score': 0.18235979974269867},
# ]
# Output: [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]

# Multi-language detection with low memory mode disabled
print(detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=False))
# Output: [{'lang': 'ru', 'score': 0.39008623361587524}, {'lang': 'zh', 'score': 0.18235979974269867}, {'lang': 'ja', 'score': 0.08473210036754608}, {'lang': 'sr', 'score': 0.057975586503744125}, {'lang': 'en', 'score': 0.05422825738787651}]
```

### Convenient `detect_language` Function
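The README note above boils down to "give fastText a single line". A minimal sketch of a wrapper that does the cleanup for the caller — the `detect_text` helper name is ours, not part of the library API:

```python
from fast_langdetect import detect


def detect_text(text: str, low_memory: bool = True) -> dict:
    """Collapse newlines to spaces so the underlying fastText model sees one line."""
    single_line = " ".join(text.splitlines())
    return detect(single_line, low_memory=low_memory)


print(detect_text("Hello, world!\nThis is a multiline text."))
# Expected shape: {'lang': 'en', 'score': <float between 0 and 1>}
```

Joining with spaces, rather than the empty string used in the README example, keeps word boundaries intact for languages that separate words with whitespace.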
4 changes: 2 additions & 2 deletions feature_test/__init__.py
@@ -6,11 +6,11 @@
from fast_langdetect import detect, detect_multilingual, detect_language

# Test Traditional Chinese, Simplified Chinese, Japanese, English, Korean, French, German, and Spanish

print(detect_multilingual("Hello, world!你好世界!Привет, мир!",low_memory=False))
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
# [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
print(detect("hello world"))

print(detect("你好世界"))
print(detect_language("Привет, мир!"))
print(detect_language("你好世界"))
print(detect_language("こんにちは世界"))
27 changes: 27 additions & 0 deletions src/fast_langdetect/ft_detect/infer.py
@@ -77,6 +77,17 @@ def detect(text: str, *,
low_memory: bool = True,
model_download_proxy: str = None
) -> Dict[str, Union[str, float]]:
"""
Detect language of text
This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
:param text: Text for language detection
:param low_memory: Whether to use low memory mode
:param model_download_proxy: model download proxy
:return: {"lang": "en", "score": 0.99}
:raise ValueError: predict processes one line at a time (remove \'\\n\')
"""
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
labels, scores = model.predict(text)
label = labels[0].replace("__label__", '')
@@ -94,6 +105,22 @@ def detect_multilingual(text: str, *,
threshold: float = 0.0,
on_unicode_error: str = "strict"
) -> List[dict]:
"""
Given a string, get a list of labels and a list of corresponding probabilities.
k controls the number of returned labels. A choice of 5, will return the 5 most probable labels.
By default this returns only the most likely label and probability. threshold filters the returned labels by a threshold on probability. A choice of 0.5 will return labels with at least 0.5 probability.
k and threshold will be applied together to determine the returned labels.
NOTE:This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
:param text: Text for language detection
:param low_memory: Whether to use low memory mode
:param model_download_proxy: model download proxy
:param k: Predict top k languages
:param threshold: Threshold for prediction
:param on_unicode_error: Error handling
:return:
"""
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
labels, scores = model.predict(text=text, k=k, threshold=threshold, on_unicode_error=on_unicode_error)
detect_result = []
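As a quick illustration of the `k`/`threshold` interplay described in the docstring above, a hedged usage sketch — the keyword arguments follow the signature shown in this diff, and the actual scores depend on which model variant (low-memory or full) is loaded:

```python
from fast_langdetect import detect_multilingual

# Ask for up to 5 labels, but keep only those with probability >= 0.1.
# k and threshold are applied together, so fewer than 5 entries may come back.
results = detect_multilingual(
    "Hello, world!你好世界!Привет, мир!",
    low_memory=False,
    k=5,
    threshold=0.1,
)
for item in results:
    print(item["lang"], round(item["score"], 3))
```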
8 changes: 8 additions & 0 deletions tests/test_detect.py
@@ -31,3 +31,11 @@ def test_detect_totally():
assert detect_language(
"這些機構主辦的課程,多以基本電腦使用為主,例如文書處理、中文輸入、互聯網應用等"
) == "ZH", "ft_detect error"


def test_failed_example():
    import pytest
    from fast_langdetect import detect
    with pytest.raises(Exception):
        detect("hello world\nNEW LINE", low_memory=True)
