Skip to content

Commit

Permalink
chore: naming "all" as ALL_NAMESPACE_FLAG and change internal magic…
Browse files Browse the repository at this point in the history
… number
  • Loading branch information
yzqzss committed Dec 4, 2024
1 parent 2af8803 commit 3343127
Show file tree
Hide file tree
Showing 5 changed files with 22 additions and 13 deletions.
5 changes: 3 additions & 2 deletions wikiteam3/dumpgenerator/api/namespaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.api import get_JSON
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils.util import ALL_NAMESPACE_FLAG

def getNamespacesScraper(config: Config, session: requests.Session):
"""Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages"""
Expand All @@ -22,7 +23,7 @@ def getNamespacesScraper(config: Config, session: requests.Session):
m = re.compile(
r'<option [^>]*?value=[\'"](?P<namespaceid>\d+)[\'"][^>]*?>(?P<namespacename>[^<]+)</option>'
).finditer(raw)
if "all" in namespaces:
if ALL_NAMESPACE_FLAG in namespaces:
namespaces = []
for i in m:
namespaces.append(int(i.group("namespaceid")))
Expand Down Expand Up @@ -70,7 +71,7 @@ def getNamespacesAPI(config: Config, session: requests.Session):
print(r.text)
raise

if "all" in namespaces:
if ALL_NAMESPACE_FLAG in namespaces:
namespaces = []
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
Expand Down
9 changes: 5 additions & 4 deletions wikiteam3/dumpgenerator/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from wikiteam3.utils.login import uniLogin
from wikiteam3.utils.monkey_patch import SessionMonkeyPatch, WakeTLSAdapter
from wikiteam3.utils.user_agent import setup_random_UserAgent
from wikiteam3.utils.util import ALL_NAMESPACE_FLAG


def getArgumentParser():
Expand Down Expand Up @@ -479,23 +480,23 @@ def sleep(self, response=None):
sys.exit(11)


namespaces = ["all"]
namespaces = [ALL_NAMESPACE_FLAG]
exnamespaces = []
# Process namespace inclusions
if args.namespaces:
# fix, why - ? and... --namespaces= all with a space works?
if (
re.search(r"[^\d, \-]", args.namespaces)
and args.namespaces.lower() != "all"
and args.namespaces.lower() != ALL_NAMESPACE_FLAG
):
print(
"Invalid namespace values.\nValid format is integer(s) separated by commas"
)
sys.exit(1)
else:
ns = re.sub(" ", "", args.namespaces)
if ns.lower() == "all":
namespaces = ["all"]
if ns.lower() == ALL_NAMESPACE_FLAG:
namespaces = [ALL_NAMESPACE_FLAG]
else:
namespaces = [int(i) for i in ns.split(",")]

Expand Down
1 change: 1 addition & 0 deletions wikiteam3/dumpgenerator/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def asdict(self):
xmlrevisions_page: bool = False
images: bool = False
namespaces: List[int] = None
""" [ALL_NAMESPACE_FLAG] or [int,...] """
exnamespaces: List[int] = None
""" save images """

Expand Down
15 changes: 8 additions & 7 deletions wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,17 @@
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import \
make_xml_from_page, make_xml_page_from_raw
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils.util import XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done
from wikiteam3.utils.util import ALL_NAMESPACE_FLAG, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done

ALL_NAMESPACE = -1
__ALL_NAMESPACE = -20241122
""" magic number refers to ALL_NAMESPACE_FLAG """

def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue: Optional[str]=None):
if "all" not in config.namespaces:
if ALL_NAMESPACE_FLAG not in config.namespaces:
namespaces = config.namespaces
else:
# namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
namespaces = [ALL_NAMESPACE] # magic number refers to "all"
namespaces = [__ALL_NAMESPACE]

# <- incremental xmldump
if env_arvcontinue := os.getenv("ARVCONTINUE", None):
Expand All @@ -44,7 +45,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit

for namespace in namespaces:
# Skip retrieved namespace
if namespace == ALL_NAMESPACE:
if namespace == __ALL_NAMESPACE:
assert len(namespaces) == 1, \
"Only one item shoule be there when 'all' namespace are specified"
_nscontinue_input = None
Expand All @@ -63,7 +64,7 @@ def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, sit
"arvlimit": config.api_chunksize,
"arvdir": "newer",
}
if namespace != ALL_NAMESPACE:
if namespace != __ALL_NAMESPACE:
arv_params['arvnamespace'] = namespace
if _arvcontinue_input is not None:
arv_params['arvcontinue'] = _arvcontinue_input
Expand Down Expand Up @@ -488,7 +489,7 @@ def handle_infinite_loop(allrevs_response: Dict, arv_params: Dict, config: Confi
print(f"API warnings: {allrevs_response.get('warnings', {})}")

if "truncated" in allrevs_response.get("warnings",{}).get("result",{}).get("*",""):
# workaround for [truncated API requests for "allrevisions" causes infinite loop ]
# workaround for [truncated API response for "allrevisions" causes infinite loop ]
# (https://github.com/mediawiki-client-tools/mediawiki-scraper/issues/166)
print("Let's try to skip this revision and continue...")
_arv_params_temp = arv_params.copy()
Expand Down
5 changes: 5 additions & 0 deletions wikiteam3/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
UPLOADED_MARK = 'uploaded_to_IA.mark'
XMLRIVISIONS_INCREMENTAL_DUMP_MARK = 'xmlrevisions_incremental_dump.mark'

ALL_NAMESPACE_FLAG = "all"
""" DO NOT CHANGE THIS VALUE, this magic value is used to work with config.json \n\n"""\
""" I wanted to use "*" as the magic flag like MediaWiki does,"""\
""" but "all" has existed in the wikiteam codebase for years :( """


def underscore(text: str) -> str:
""" replace(" ", "_") """
Expand Down

0 comments on commit 3343127

Please sign in to comment.