From ee64d5ee86d53215d1148291a92dc07017f47f9e Mon Sep 17 00:00:00 2001
From: yzqzss
Date: Tue, 9 Jul 2024 18:45:12 +0800
Subject: [PATCH] feat: incremental xmldump (--xmlrevisions) PoC

When the ARVCONTINUE environment variable is set,
getXMLRevisionsByAllRevisions() overrides its arvcontinue argument with
that value and marks the dump with xmlrevisions_incremental_dump.mark.

wikiteam3/tools/get_arvcontinue.py prints an ARVCONTINUE="..." line taken
from the arvcontinue attribute of the last page of an existing XML dump.

Uploading incremental dumps is not implemented yet, so the uploader
refuses any dump that carries the incremental mark.
---
 .../dump/page/xmlrev/xml_revisions.py | 13 +++++++++++-
 wikiteam3/tools/get_arvcontinue.py    | 21 +++++++++++++++++++
 wikiteam3/uploader/uploader.py        |  3 ++-
 wikiteam3/utils/util.py               |  1 +
 4 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 wikiteam3/tools/get_arvcontinue.py

diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
index 5a4c7cd1..d2bfd20b 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py
@@ -1,4 +1,5 @@
 from datetime import datetime
+import os
 import sys
 import time
 from typing import Dict, List, Optional
@@ -17,15 +18,25 @@
 from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import \
     make_xml_from_page, make_xml_page_from_raw
 from wikiteam3.dumpgenerator.config import Config
+from wikiteam3.utils.util import XMLRIVISIONS_INCREMENTAL_DUMP_MARK, mark_as_done
 
 ALL_NAMESPACE = -1
 
-def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue=None):
+def getXMLRevisionsByAllRevisions(config: Config, session: requests.Session, site: mwclient.Site, nscontinue=None, arvcontinue: Optional[str]=None):
     if "all" not in config.namespaces:
         namespaces = config.namespaces
     else:
         # namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
         namespaces = [ALL_NAMESPACE] # magic number refers to "all"
+
+    # <- incremental xmldump: $ARVCONTINUE overrides the arvcontinue argument
+    if env_arvcontinue := os.getenv("ARVCONTINUE", None):
+        mark_as_done(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK)
+        print(f"Using [env]ARVCONTINUE={env_arvcontinue}")
+        arvcontinue = env_arvcontinue
+        print("\n\n[NOTE] DO NOT use wikiteam3uploader to upload incremental xmldump to Internet Archive, we haven't implemented it yet\n\n")
+    # ->
+
     _nscontinue_input = nscontinue
     _arvcontinue_input = arvcontinue
     del nscontinue
diff --git a/wikiteam3/tools/get_arvcontinue.py b/wikiteam3/tools/get_arvcontinue.py
new file mode 100644
index 00000000..77b7f42e
--- /dev/null
+++ b/wikiteam3/tools/get_arvcontinue.py
@@ -0,0 +1,21 @@
+import argparse
+
+from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import parse_last_page_chunk, truncateXMLDump
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Get the next arvcontinue value")
+    parser.add_argument("xml", help="XML file")
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+    xmlfile: str = args.xml
+    lastPageChunk = truncateXMLDump(xmlfile, dryrun=True)
+    lastPage = parse_last_page_chunk(lastPageChunk)
+    assert lastPage is not None
+    lastArvcontinue = lastPage.attrib['arvcontinue']
+    print(f'ARVCONTINUE="{lastArvcontinue}"')
+
+if __name__ == "__main__":
+    main()
diff --git a/wikiteam3/uploader/uploader.py b/wikiteam3/uploader/uploader.py
index 66b2bbd3..f66d9df2 100644
--- a/wikiteam3/uploader/uploader.py
+++ b/wikiteam3/uploader/uploader.py
@@ -25,7 +25,7 @@
 from wikiteam3.utils import url2prefix_from_config, sha1sum
 from wikiteam3.uploader.compresser import ZstdCompressor, SevenZipCompressor
 from wikiteam3.utils.ia_checker import ia_s3_tasks_load_avg
-from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, is_empty_dir, mark_as_done, is_markfile_exists
+from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, XMLRIVISIONS_INCREMENTAL_DUMP_MARK, is_empty_dir, mark_as_done, is_markfile_exists
 
 DEFAULT_COLLECTION = 'opensource'
 TEST_COLLECTION = 'test_collection'
@@ -384,6 +384,7 @@ def upload(arg: Args):
 
     assert wikidump_dir == Path(config.path).resolve()
     assert is_markfile_exists(config, ALL_DUMPED_MARK), "Imcomplete dump"
+    assert not is_markfile_exists(config, XMLRIVISIONS_INCREMENTAL_DUMP_MARK), "xmlrevisions incremental dump is not supported yet"
     if is_markfile_exists(config, UPLOADED_MARK):
         print(f"Already uploaded to IA ({UPLOADED_MARK} exists), bye!")
         return
diff --git a/wikiteam3/utils/util.py b/wikiteam3/utils/util.py
index e1c9b1a0..d9e06e98 100644
--- a/wikiteam3/utils/util.py
+++ b/wikiteam3/utils/util.py
@@ -9,6 +9,7 @@
 
 ALL_DUMPED_MARK = "all_dumped.mark"
 UPLOADED_MARK = 'uploaded_to_IA.mark'
+XMLRIVISIONS_INCREMENTAL_DUMP_MARK = 'xmlrevisions_incremental_dump.mark'
 
 
 def underscore(text: str) -> str: