📝 docs: Update README and docstrings with multiline text handling
- Added note in README about handling multiline text for language detection
- Updated example outputs in README for clarity
- Added docstrings to infer.py functions for better documentation
- Added test for exception handling in detect function in test_detect.py
sudoskys committed Sep 14, 2024
1 parent 0b4d872 commit 37aca03
Showing 4 changed files with 57 additions and 7 deletions.
25 changes: 20 additions & 5 deletions README.md
@@ -45,22 +45,37 @@ model.
### Native API (Recommended)

> [!NOTE]
> This function assumes it is given a single line of text. *You should remove `\n` characters before passing the text.*
> If the sample is too long or too short, accuracy will decrease (for example, a very short Chinese sample
> may be detected as Japanese).
```python
from fast_langdetect import detect, detect_multilingual

# Single language detection
print(detect("Hello, world!"))
# Output: {'lang': 'en', 'score': 0.1520957201719284}
# Output: {'lang': 'en', 'score': 0.12450417876243591}

multiline_text = """
Hello, world!
This is a multiline text.
But we need to remove `\n` characters, or it will raise a ValueError.
"""
multiline_text = multiline_text.replace("\n", "")
print(detect(multiline_text))
# Output: {'lang': 'en', 'score': 0.8509423136711121}

print(detect("Привет, мир!")["lang"])
# Output: ru

# Multi-language detection
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
# Output: [
# {'lang': 'ru', 'score': 0.39008623361587524},
# {'lang': 'zh', 'score': 0.18235979974269867},
# ]
# Output: [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]

# Multi-language detection with low memory mode disabled
print(detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=False))
# Output: [{'lang': 'ru', 'score': 0.39008623361587524}, {'lang': 'zh', 'score': 0.18235979974269867}, {'lang': 'ja', 'score': 0.08473210036754608}, {'lang': 'sr', 'score': 0.057975586503744125}, {'lang': 'en', 'score': 0.05422825738787651}]
```

### Convenient `detect_language` Function
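The README note above boils down to "give fastText a single line". A minimal sketch of a wrapper that does the cleanup for the caller — the `detect_text` helper name is ours, not part of the library API:

```python
from fast_langdetect import detect


def detect_text(text: str, low_memory: bool = True) -> dict:
    """Collapse newlines to spaces so the underlying fastText model sees one line."""
    single_line = " ".join(text.splitlines())
    return detect(single_line, low_memory=low_memory)


print(detect_text("Hello, world!\nThis is a multiline text."))
# Expected shape: {'lang': 'en', 'score': <float between 0 and 1>}
```

Joining with spaces, rather than the empty string used in the README example, keeps word boundaries intact for languages that separate words with whitespace.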
4 changes: 2 additions & 2 deletions feature_test/__init__.py
@@ -6,11 +6,11 @@
from fast_langdetect import detect, detect_multilingual, detect_language

# Test Traditional Chinese, Simplified Chinese, Japanese, English, Korean, French, German, and Spanish

print(detect_multilingual("Hello, world!你好世界!Привет, мир!",low_memory=False))
print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
# [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
print(detect("hello world"))

print(detect("你好世界"))
print(detect_language("Привет, мир!"))
print(detect_language("你好世界"))
print(detect_language("こんにちは世界"))
27 changes: 27 additions & 0 deletions src/fast_langdetect/ft_detect/infer.py
@@ -77,6 +77,17 @@ def detect(text: str, *,
low_memory: bool = True,
model_download_proxy: str = None
) -> Dict[str, Union[str, float]]:
"""
Detect language of text
This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
:param text: Text for language detection
:param low_memory: Whether to use low memory mode
:param model_download_proxy: model download proxy
:return: {"lang": "en", "score": 0.99}
:raise ValueError: predict processes one line at a time (remove \'\\n\')
"""
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
labels, scores = model.predict(text)
label = labels[0].replace("__label__", '')
@@ -94,6 +105,22 @@ def detect_multilingual(text: str, *,
threshold: float = 0.0,
on_unicode_error: str = "strict"
) -> List[dict]:
"""
Given a string, get a list of labels and a list of corresponding probabilities.
k controls the number of returned labels. A choice of 5, will return the 5 most probable labels.
By default this returns only the most likely label and probability. threshold filters the returned labels by a threshold on probability. A choice of 0.5 will return labels with at least 0.5 probability.
k and threshold will be applied together to determine the returned labels.
NOTE:This function assumes to be given a single line of text. We split words on whitespace (space, newline, tab, vertical tab) and the control characters carriage return, formfeed and the null character.
:param text: Text for language detection
:param low_memory: Whether to use low memory mode
:param model_download_proxy: model download proxy
:param k: Predict top k languages
:param threshold: Threshold for prediction
:param on_unicode_error: Error handling
:return:
"""
model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
labels, scores = model.predict(text=text, k=k, threshold=threshold, on_unicode_error=on_unicode_error)
detect_result = []
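As a quick illustration of the `k`/`threshold` interplay described in the docstring above, a hedged usage sketch — the keyword arguments follow the signature shown in this diff, and the actual scores depend on which model variant (low-memory or full) is loaded:

```python
from fast_langdetect import detect_multilingual

# Ask for up to 5 labels, but keep only those with probability >= 0.1.
# k and threshold are applied together, so fewer than 5 entries may come back.
results = detect_multilingual(
    "Hello, world!你好世界!Привет, мир!",
    low_memory=False,
    k=5,
    threshold=0.1,
)
for item in results:
    print(item["lang"], round(item["score"], 3))
```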
8 changes: 8 additions & 0 deletions tests/test_detect.py
@@ -31,3 +31,11 @@ def test_detect_totally():
assert detect_language(
"這些機構主辦的課程,多以基本電腦使用為主,例如文書處理、中文輸入、互聯網應用等"
) == "ZH", "ft_detect error"


def test_failed_example():
    import pytest
    from fast_langdetect import detect
    with pytest.raises(Exception):
        detect("hello world\nNEW LINE", low_memory=True)
