diff --git a/README.md b/README.md
index 25de229..827e317 100644
--- a/README.md
+++ b/README.md
@@ -45,22 +45,37 @@ model.
 
 ### Native API (Recommended)
 
+> [!NOTE]
+> The detection functions expect a single line of text. *Remove `\n` characters before passing the text in.*
+> Accuracy also drops when the sample is too long or too short (a very short Chinese sample, for
+> example, may be detected as Japanese).
+
 ```python
 from fast_langdetect import detect, detect_multilingual
 
 # Single language detection
 print(detect("Hello, world!"))
-# Output: {'lang': 'en', 'score': 0.1520957201719284}
+# Output: {'lang': 'en', 'score': 0.12450417876243591}
+
+multiline_text = """
+Hello, world!
+This is a multiline text.
+But we need to remove `\n` characters or it will raise a ValueError.
+"""
+multiline_text = multiline_text.replace("\n", "")
+print(detect(multiline_text))
+# Output: {'lang': 'en', 'score': 0.8509423136711121}
 
 print(detect("Привет, мир!")["lang"])
 # Output: ru
 
 # Multi-language detection
 print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
-# Output: [
-#     {'lang': 'ru', 'score': 0.39008623361587524},
-#     {'lang': 'zh', 'score': 0.18235979974269867},
-# ]
+# Output: [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
+
+# Multi-language detection with low memory mode disabled
+print(detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=False))
+# Output: [{'lang': 'ru', 'score': 0.39008623361587524}, {'lang': 'zh', 'score': 0.18235979974269867}, {'lang': 'ja', 'score': 0.08473210036754608}, {'lang': 'sr', 'score': 0.057975586503744125}, {'lang': 'en', 'score': 0.05422825738787651}]
 ```
 
 ### Convenient `detect_language` Function
diff --git a/feature_test/__init__.py b/feature_test/__init__.py
index ae66914..e9e4a5c 100644
--- a/feature_test/__init__.py
+++ b/feature_test/__init__.py
@@ -6,11 +6,11 @@ from fast_langdetect import detect, detect_multilingual, detect_language
 
 
 # Test Traditional Chinese, Simplified Chinese, Japanese, English, Korean, French, German, and Spanish
-
+print(detect_multilingual("Hello, world!你好世界!Привет, мир!", low_memory=False))
 print(detect_multilingual("Hello, world!你好世界!Привет, мир!"))
 # [{'lang': 'ja', 'score': 0.32009604573249817}, {'lang': 'uk', 'score': 0.27781224250793457}, {'lang': 'zh', 'score': 0.17542070150375366}, {'lang': 'sr', 'score': 0.08751443773508072}, {'lang': 'bg', 'score': 0.05222449079155922}]
 print(detect("hello world"))
-
+print(detect("你好世界"))
 print(detect_language("Привет, мир!"))
 print(detect_language("你好世界"))
 print(detect_language("こんにちは世界"))
diff --git a/src/fast_langdetect/ft_detect/infer.py b/src/fast_langdetect/ft_detect/infer.py
index 6a7959a..a2d7ba1 100644
--- a/src/fast_langdetect/ft_detect/infer.py
+++ b/src/fast_langdetect/ft_detect/infer.py
@@ -77,6 +77,17 @@ def detect(text: str, *,
            low_memory: bool = True,
            model_download_proxy: str = None
            ) -> Dict[str, Union[str, float]]:
+    """
+    Detect the language of a single line of text.
+
+    This function expects a single line of text; words are split on whitespace (space, newline, tab, vertical tab) and on the control characters carriage return, form feed and the null character.
+
+    :param text: Text for language detection
+    :param low_memory: Whether to use low memory mode
+    :param model_download_proxy: Model download proxy
+    :return: A dict such as {"lang": "en", "score": 0.99}
+    :raise ValueError: predict processes one line at a time; remove '\\n' from the text first
+    """
     model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
     labels, scores = model.predict(text)
     label = labels[0].replace("__label__", '')
@@ -94,6 +105,22 @@ def detect_multilingual(text: str, *,
                         threshold: float = 0.0,
                         on_unicode_error: str = "strict"
                         ) -> List[dict]:
+    """
+    Given a string, return a list of candidate language labels and their corresponding probabilities.
+    k controls the number of returned labels; a value of 5 returns the 5 most probable labels.
+    threshold filters the returned labels by probability; a value of 0.5 returns only labels with a probability of at least 0.5.
+    k and threshold are applied together to determine the returned labels.
+
+    NOTE: This function expects a single line of text. Words are split on whitespace (space, newline, tab, vertical tab) and on the control characters carriage return, form feed and the null character.
+
+    :param text: Text for language detection
+    :param low_memory: Whether to use low memory mode
+    :param model_download_proxy: Model download proxy
+    :param k: Return the top k most probable languages
+    :param threshold: Minimum probability for a language to be returned
+    :param on_unicode_error: How to handle Unicode decoding errors ("strict" by default)
+    :return: A list of dicts such as [{"lang": "ru", "score": 0.39}, {"lang": "zh", "score": 0.18}]
+    """
     model = get_model_loaded(low_memory=low_memory, download_proxy=model_download_proxy)
     labels, scores = model.predict(text=text, k=k, threshold=threshold, on_unicode_error=on_unicode_error)
     detect_result = []
diff --git a/tests/test_detect.py b/tests/test_detect.py
index 533a738..880c46e 100644
--- a/tests/test_detect.py
+++ b/tests/test_detect.py
@@ -31,3 +31,11 @@ def test_detect_totally():
     assert detect_language(
         "這些機構主辦的課程,多以基本電腦使用為主,例如文書處理、中文輸入、互聯網應用等"
     ) == "ZH", "ft_detect error"
+
+
+def test_failed_example():
+    import pytest
+    from fast_langdetect import detect
+    # detect() must reject multi-line input: the model predicts one line at a time
+    with pytest.raises(ValueError):
+        detect("hello world\nNEW LINE", low_memory=True)
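
For context, here is a minimal usage sketch of the `k` and `threshold` parameters documented in the new `detect_multilingual` docstring. It is not part of the patch, and the scores referenced in the comments are the illustrative values from the README example rather than exact outputs:

```python
from fast_langdetect import detect_multilingual

text = "Hello, world!你好世界!Привет, мир!"

# Request at most 3 candidate languages and keep only those whose
# probability is at least 0.1; k and threshold are applied together.
candidates = detect_multilingual(text, low_memory=False, k=3, threshold=0.1)
for result in candidates:
    print(result["lang"], result["score"])

# With the low_memory=False scores shown in the README (ru ≈ 0.39, zh ≈ 0.18,
# ja ≈ 0.08), only 'ru' and 'zh' clear the 0.1 threshold, so two lines print.
```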