From 3343127efa3984838922c9e3f647b0891b3f0ae5 Mon Sep 17 00:00:00 2001 From: yzqzss Date: Thu, 5 Dec 2024 01:20:29 +0800 Subject: [PATCH] chore: naming "all" as `ALL_NAMESPACE_FLAG` and change internel magic number --- wikiteam3/dumpgenerator/api/namespaces.py | 5 +++-- wikiteam3/dumpgenerator/cli/cli.py | 9 +++++---- wikiteam3/dumpgenerator/config.py | 1 + .../dump/page/xmlrev/xml_revisions.py | 15 ++++++++------- wikiteam3/utils/util.py | 5 +++++ 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/wikiteam3/dumpgenerator/api/namespaces.py b/wikiteam3/dumpgenerator/api/namespaces.py index 07214e13..05fb3358 100644 --- a/wikiteam3/dumpgenerator/api/namespaces.py +++ b/wikiteam3/dumpgenerator/api/namespaces.py @@ -5,6 +5,7 @@ from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.api import get_JSON from wikiteam3.dumpgenerator.config import Config +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG def getNamespacesScraper(config: Config, session: requests.Session): """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages""" @@ -22,7 +23,7 @@ def getNamespacesScraper(config: Config, session: requests.Session): m = re.compile( r'' ).finditer(raw) - if "all" in namespaces: + if ALL_NAMESPACE_FLAG in namespaces: namespaces = [] for i in m: namespaces.append(int(i.group("namespaceid"))) @@ -70,7 +71,7 @@ def getNamespacesAPI(config: Config, session: requests.Session): print(r.text) raise - if "all" in namespaces: + if ALL_NAMESPACE_FLAG in namespaces: namespaces = [] for i in nsquery.keys(): if int(i) < 0: # -1: Special, -2: Media, excluding diff --git a/wikiteam3/dumpgenerator/cli/cli.py b/wikiteam3/dumpgenerator/cli/cli.py index 4436ca14..76529b5a 100644 --- a/wikiteam3/dumpgenerator/cli/cli.py +++ b/wikiteam3/dumpgenerator/cli/cli.py @@ -32,6 +32,7 @@ from wikiteam3.utils.login import uniLogin from wikiteam3.utils.monkey_patch import SessionMonkeyPatch, WakeTLSAdapter from 
wikiteam3.utils.user_agent import setup_random_UserAgent +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG def getArgumentParser(): @@ -479,14 +480,14 @@ def sleep(self, response=None): sys.exit(11) - namespaces = ["all"] + namespaces = [ALL_NAMESPACE_FLAG] exnamespaces = [] # Process namespace inclusions if args.namespaces: # fix, why - ? and... --namespaces= all with a space works? if ( re.search(r"[^\d, \-]", args.namespaces) - and args.namespaces.lower() != "all" + and args.namespaces.lower() != ALL_NAMESPACE_FLAG ): print( "Invalid namespace values.\nValid format is integer(s) separated by commas" @@ -494,8 +495,8 @@ def sleep(self, response=None): sys.exit(1) else: ns = re.sub(" ", "", args.namespaces) - if ns.lower() == "all": - namespaces = ["all"] + if ns.lower() == ALL_NAMESPACE_FLAG: + namespaces = [ALL_NAMESPACE_FLAG] else: namespaces = [int(i) for i in ns.split(",")] diff --git a/wikiteam3/dumpgenerator/config.py b/wikiteam3/dumpgenerator/config.py index 517502e3..0f258fde 100644 --- a/wikiteam3/dumpgenerator/config.py +++ b/wikiteam3/dumpgenerator/config.py @@ -51,6 +51,7 @@ def asdict(self): xmlrevisions_page: bool = False images: bool = False namespaces: List[int] = None + """ [ALL_NAMESPACE_FLAG] or [int,...] 
""" exnamespaces: List[int] = None """ save images """ diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py index d2bfd20b..8a4f5b7a 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py @@ -18,16 +18,17 @@ from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import \ make_xml_from_page, make_xml_page_from_raw from wikiteam3.dumpgenerator.config import Config -from wikiteam3.utils.util import XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done +from wikiteam3.utils.util import ALL_NAMESPACE_FLAG, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done -ALL_NAMESPACE = -1 +__ALL_NAMESPACE = -20241122 +""" magic number refers to ALL_NAMESPACE_FLAG """ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue: Optional[str]=None): - if "all" not in config.namespaces: + if ALL_NAMESPACE_FLAG not in config.namespaces: namespaces = config.namespaces else: # namespaces, namespacenames = getNamespacesAPI(config=config, session=session) - namespaces = [ALL_NAMESPACE] # magic number refers to "all" + namespaces = [__ALL_NAMESPACE] # <- increasement xmldump if env_arvcontinue := os.getenv("ARVCONTINUE", None): @@ -44,7 +45,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit for namespace in namespaces: # Skip retrived namespace - if namespace == ALL_NAMESPACE: + if namespace == __ALL_NAMESPACE: assert len(namespaces) == 1, \ "Only one item shoule be there when 'all' namespace are specified" _nscontinue_input = None @@ -63,7 +64,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit "arvlimit": config.api_chunksize, "arvdir": "newer", } - if namespace != ALL_NAMESPACE: + if namespace != __ALL_NAMESPACE: arv_params['arvnamespace'] = namespace if _arvcontinue_input is not None: 
arv_params['arvcontinue'] = _arvcontinue_input @@ -488,7 +489,7 @@ def handle_infinite_loop(allrevs_response: Dict, arv_params: Dict, config: Confi print(f"API warnings: {allrevs_response.get('warnings', {})}") if "truncated" in allrevs_response.get("warnings",{}).get("result",{}).get("*",""): - # workaround for [truncated API requests for "allrevisions" causes infinite loop ] + # workaround for [truncated API response for "allrevisions" causes infinite loop ] # (https://github.com/mediawiki-client-tools/mediawiki-scraper/issues/166) print("Let's try to skip this revision and continue...") _arv_params_temp = arv_params.copy() diff --git a/wikiteam3/utils/util.py b/wikiteam3/utils/util.py index ab518b05..cb5a8066 100644 --- a/wikiteam3/utils/util.py +++ b/wikiteam3/utils/util.py @@ -11,6 +11,11 @@ UPLOADED_MARK = 'uploaded_to_IA.mark' XMLRIVISIONS_INCREMENTAL_DUMP_MARK = 'xmlrevisions_incremental_dump.mark' +ALL_NAMESPACE_FLAG = "all" +""" DO NOT CHANGE THIS VALUE, this magic value is used to work with config.json \n\n"""\ +""" I wanted to use "*" as the magic flag like MediaWiki does,"""\ +""" but "all" has been in the wikiteam codebase for years :( """ + def underscore(text: str) -> str: """ replace(" ", "_") """