diff --git a/README.md b/README.md index 51a5441..cdea319 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Type "help", "copyright", "credits" or "license" for more information. History: +* 1.4.1: Added an optional default_scheme parameter to url_normalize * 1.4.0: A bit of code refactoring and cleanup * 1.3.3: Support empty string and double slash urls (//domain.tld) * 1.3.2: Same code support both Python 3 and Python 2. @@ -45,7 +46,7 @@ History: * 1.1.2: support for shebang (#!) urls * 1.1.1: using 'http' schema by default when appropriate * 1.1.0: added handling of IDN domains -* 1.0.0: code pep8-zation +* 1.0.0: code pep8 * 0.1.0: forked from Sam Ruby's urlnorm.py License: "Python" (PSF) License diff --git a/pyproject.toml b/pyproject.toml index 8ed8581..d4ab1a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "url-normalize" -version = "1.4.0" +version = "1.4.1" description = "URL normalization for Python" authors = ["Nikolay Panov "] license = "PSF" diff --git a/tests/test_provide_url_scheme.py b/tests/test_provide_url_scheme.py index abde7de..991aea5 100644 --- a/tests/test_provide_url_scheme.py +++ b/tests/test_provide_url_scheme.py @@ -18,3 +18,13 @@ def test_provide_url_scheme_result_is_expected(): result = provide_url_scheme(url) assert result == expected, url + + +def test_provide_url_scheme_accept_default_scheme_param(): + """Assert we could provide default_scheme param other than https.""" + url = "//site/path" + expected = "http://site/path" + + actual = provide_url_scheme(url, default_scheme="http") + + assert actual == expected diff --git a/tests/test_url_normalize.py b/tests/test_url_normalize.py index a98fb99..dad043e 100644 --- a/tests/test_url_normalize.py +++ b/tests/test_url_normalize.py @@ -92,3 +92,13 @@ def test_url_normalize_results(): """Assert url_normalize return expected results.""" for value, expected in EXPECTED_RESULTS.items(): assert expected == url_normalize(value), value + + +def 
test_url_normalize_with_http_scheme(): + """Assert we could use http scheme as default.""" + url = "//www.foo.com/" + expected = "http://www.foo.com/" + + actual = url_normalize(url, default_scheme='http') + + assert actual == expected \ No newline at end of file diff --git a/url_normalize/__init__.py b/url_normalize/__init__.py index 971e213..912554b 100644 --- a/url_normalize/__init__.py +++ b/url_normalize/__init__.py @@ -20,17 +20,6 @@ http://intertwingly.net/blog/2004/08/04/Urlnorm This fork author: Nikolay Panov () -History: -* 1.4.0: A bit of code refactoring and cleanup -* 1.3.3: Support empty string and double slash urls (//domain.tld) -* 1.3.2: Same code support both Python 3 and Python 2. -* 1.3.1: Python 3 compatibility -* 1.2.1: PEP8, setup.py -* 1.1.2: support for shebang (#!) urls -* 1.1.1: using 'http' schema by default when appropriate -* 1.1.0: added handling of IDN domains -* 1.0.0: code pep8-zation -* 0.1.0: forked from Sam Ruby's urlnorm.py """ from __future__ import absolute_import @@ -38,6 +27,6 @@ from .url_normalize import url_normalize __license__ = "Python" -__version__ = "1.4.0" +__version__ = "1.4.1" __all__ = ["url_normalize"] diff --git a/url_normalize/url_normalize.py b/url_normalize/url_normalize.py index 44aee13..0c2d96d 100644 --- a/url_normalize/url_normalize.py +++ b/url_normalize/url_normalize.py @@ -17,27 +17,29 @@ "ws": "80", "wss": "443", } +DEFAULT_CHARSET = "utf-8" DEFAULT_SCHEME = "https" -def provide_url_scheme(url): +def provide_url_scheme(url, default_scheme=DEFAULT_SCHEME): """Make sure we have valid url scheme. Params: url : string : the URL + default_scheme : string : default scheme to use, e.g. 
'https' Returns: string : updated url with validated/attached scheme """ has_scheme = ":" in url[:7] - is_default_scheme = url.startswith("//") - is_file_path = url == "-" or (url.startswith("/") and not is_default_scheme) + is_universal_scheme = url.startswith("//") + is_file_path = url == "-" or (url.startswith("/") and not is_universal_scheme) if not url or has_scheme or is_file_path: return url - if is_default_scheme: - return DEFAULT_SCHEME + ":" + url - return DEFAULT_SCHEME + "://" + url + if is_universal_scheme: + return default_scheme + ":" + url + return default_scheme + "://" + url def generic_url_cleanup(url): @@ -86,7 +88,7 @@ def normalize_userinfo(userinfo): return userinfo -def normalize_host(host, charset="utf-8"): +def normalize_host(host, charset=DEFAULT_CHARSET): """Normalize host part of the url. Lowercase and strip of final dot. @@ -204,7 +206,7 @@ def normalize_query(query): return query -def url_normalize(url, charset="utf-8"): +def url_normalize(url, charset=DEFAULT_CHARSET, default_scheme=DEFAULT_SCHEME): """URI normalization routine. Sometimes you get an URL by a user that just isn't a real @@ -218,10 +220,14 @@ def url_normalize(url, charset="utf-8"): Params: charset : string : optional The target charset for the URL if the url was given as unicode string. + + Returns: + string : a normalized url + """ if not url: return url - url = provide_url_scheme(url) + url = provide_url_scheme(url, default_scheme) url = generic_url_cleanup(url) url_elements = deconstruct_url(url) url_elements = url_elements._replace(