Merged
Add support to back up pages using API:Query instead of Special:Export
(WikiTeam#280), merged into the recent version of dumpgenerator.py.

Adds an additional parameter, --apiexport, which issues a query request
against api.php instead of submitting to Special:Export, so pages can be
exported from sites where Special:Export is disabled.
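For context, here is a minimal sketch of the kind of request the --apiexport path issues, assuming a standard MediaWiki api.php endpoint. The URL and page title below are placeholders; the parameter names mirror the new getXMLPageWithApi code in the diff.

    import requests

    # Placeholder endpoint; substitute the target wiki's api.php.
    API_URL = 'https://wiki.example.org/w/api.php'

    # Full-history request, paged via rvcontinue (rvlimit revisions per call).
    params = {
        'action': 'query',
        'format': 'xml',
        'prop': 'revisions',
        'titles': 'Main_Page',  # placeholder title
        'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
        'rvlimit': 10,
    }
    r = requests.get(API_URL, params=params)
    print(r.text[:200])  # <api> XML that the new code rebuilds into <page>/<revision> elements

With the new flag, an invocation would look roughly like:

    python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport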
GERZAC1002 committed Apr 11, 2022
1 parent 054397a commit 5e1978e
252 changes: 243 additions & 9 deletions dumpgenerator.py
@@ -23,10 +23,21 @@
    from kitchen.text.converters import getwriter, to_unicode
except ImportError:
    print "Please install the kitchen module."

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import xml.dom.minidom as MD

import cookielib
import cPickle
import datetime
import sys
import io
import traceback

try:
    import argparse
except ImportError:
@@ -63,7 +74,7 @@
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.5.0-alpha' # major, minor, micro: semver.org

class PageMissingError(Exception):
    def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
            url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
            url=config['index'], params={'title': 'Special:Allpages'}, timeout=120)
        raw = r.text
        delay(config=config, session=session)

@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'},
            timeout=30
            timeout=120
        )
        result = getJSON(r)
        delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
        print '    Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
        r = session.get(url=url, timeout=30)
        r = session.get(url=url, timeout=120)
        raw = r.text
        raw = cleanHTML(raw)

@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):

    else:
        try:
            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
            xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
        except PageMissingError as pme:
            # The <page> does not exist. Not a problem, if we get the <siteinfo>.
            xml = pme.xml
@@ -477,7 +488,7 @@ def getXMLHeader(config={}, session=None):
                    )
                    config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                        + ':Export'
                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
                    xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
            except PageMissingError as pme:
                xml = pme.xml
            except ExportAbortedError:
@@ -500,7 +511,7 @@

def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1  # tricky to get only the most recent desc
    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
    return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))


def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
            output = u'%s: %s\n' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))


def reconstructRevisions(root=None):
    # print ET.tostring(rev)
    page = ET.Element('stub')
    edits = 0
    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
        try:
            rev_ = ET.SubElement(page, 'revision')
            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
            contributor = ET.SubElement(rev_, 'contributor')
            if not rev.attrib.has_key('userhidden'):
                ET.SubElement(contributor, 'username').text = rev.attrib['user']
                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
            else:
                contributor.set('deleted', 'deleted')
            comment = ET.SubElement(rev_, 'comment')
            if not rev.attrib.has_key('commenthidden'):
                comment.text = rev.attrib['comment']
            else:
                comment.set('deleted', 'deleted')

            # some revisions do not return model and format, so hard-code them
            ET.SubElement(rev_, 'model').text = 'wikitext'
            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
            text = ET.SubElement(rev_, 'text')
            if not rev.attrib.has_key('texthidden'):
                text.attrib['xml:space'] = "preserve"
                text.attrib['bytes'] = rev.attrib['size']
                text.text = rev.text
            else:
                text.set('deleted', 'deleted')
            # delete sha1 here :)
            #sha1 = ET.SubElement(rev_,'sha1')
            #if not rev.attrib.has_key('sha1missing'):
                #sha1.text = rev.attrib['sha1']
            if rev.attrib.has_key('minor'):
                ET.SubElement(rev_, 'minor')
            edits += 1
        except Exception as e:
            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
            print ET.tostring(rev)
            traceback.print_exc()
            page = None
            edits = 0
            raise e
    return page, edits

def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
    """ """
    # just send the API request
    # if it fails, it will reduce params['rvlimit']
    xml = ''
    c = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and skip
    increment = 20  # increment every retry

    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' % (c, params['titles' if config['apiexport'] else 'pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # rvlimit = 1 from mother function)
            if params['rvlimit'] > 1:
                params['rvlimit'] = params['rvlimit'] / 2  # half
        if c >= maxretries:
            print '    We have retried %d times' % (c)
            print '    MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            # TODO: save only the last version when failed
            print '    Saving in the errors log, and skipping...'
            logerror(
                config=config,
                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
            # raise ExportAbortedError(config['index'])
            return ''  # empty xml

        # FIXME HANDLE HTTP Errors HERE
        try:
            r = session.get(url=config['api'], params=params, headers=headers)
            handleStatusCode(r)
            xml = fixBOM(r)
            # print xml
        except requests.exceptions.ConnectionError as e:
            print '    Connection error: %s' % (str(e[0]))
            xml = ''
        c += 1
    return xml
def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
    """ Get the full history (or only the current revision) of a page using API:Query.
        If config['curonly'] is set, export&exportnowrap is used instead.
    """

    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
    # print 'current:%s' % (title_)
    if not config['curonly']:
        params = {'titles': title_, 'action': 'query', 'format': 'xml',
                  'prop': 'revisions',
                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
                  'rvcontinue': None,
                  'rvlimit': 10  # TODO: set this by commandline
                  }
    else:
        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
    # print 'params:%s' % (params)
    if not config['curonly']:
        firstpartok = False
        lastcontinue = None
        numberofedits = 0
        ret = ''
        while True:
            # in case the last request is not right, save last time's progress
            if not firstpartok:
                try:
                    lastcontinue = params['rvcontinue']
                except:
                    lastcontinue = None

            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
            if xml == "":
                # just return so that we can continue; getXMLPageCoreWithApi will log the error
                return
            try:
                root = ET.fromstring(xml.encode('utf-8'))
            except:
                continue
            try:
                retpage = root.find('query').find('pages').find('page')
            except:
                continue
            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
                print 'Page not found'
                raise PageMissingError(params['titles'], xml)
            if not firstpartok:
                try:
                    # build the first part ourselves to reduce memory usage
                    ret = '  <page>\n'
                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
                except:
                    firstpartok = False
                    continue
                else:
                    firstpartok = True
                    yield ret
            try:
                ret = ''
                edits = 0
                if config['curonly'] or root.find('continue') == None:
                    # transform the revisions
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    # convert it into text in case it throws MemoryError
                    # drop the first three lines and the last two lines, which only set the indent
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    yield ret
                    numberofedits += edits
                    break
                else:
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
                    numberofedits += edits
                    yield ret
            except:
                traceback.print_exc()
                params['rvcontinue'] = lastcontinue
                ret = ''
        yield '  </page>\n'
    else:
        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
        if xml == "":
            raise ExportAbortedError(config['index'])
        if not "</page>" in xml:
            raise PageMissingError(params['titles'], xml)
        else:
            # strip these sha1 sums which keep showing up in the export and
            # which are invalid for the XML schema (they only apply to
            # revisions)
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

        yield xml.split("</page>")[0]

        # just for looking good :)
        r_timestamp = r'<timestamp>([^<]+)</timestamp>'

        numberofedits = 0
        numberofedits += len(re.findall(r_timestamp, xml))

        yield "</page>\n"

    if verbose:
        if (numberofedits == 1):
            print '    %s, 1 edit' % (title.strip())
        else:
            print '    %s, %d edits' % (title.strip(), numberofedits)


def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """ """
@@ -694,7 +914,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
            print '    %s, 1 edit' % (title.strip())
        else:
            print '    %s, %d edits' % (title.strip(), numberofedits)


def getXMLPage_(config={}, title='', verbose=True, session=None):
    # print config
    if config['apiexport']:
        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
    else:
        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
    return ''

def makeXmlPageFromRaw(xml):
""" Discard the metadata around a <page> element in <mediawiki> string"""
@@ -775,7 +1001,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
            if c % 10 == 0:
                print 'Downloaded %d pages' % (c)
            try:
                for xml in getXMLPage(config=config, title=title, session=session):
                for xml in getXMLPage_(config=config, title=title, session=session):
                    xml = cleanXML(xml=xml)
                    xmlfile.write(xml.encode('utf-8'))
            except PageMissingError:
@@ -1680,6 +1906,7 @@ def getParameters(params=[]):
        action='store_true',
        help='resumes previous incomplete dump (requires --path)')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument('--ignore-api-check', action='store_true', help='')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
@@ -1723,6 +1950,10 @@ def getParameters(params=[]):
        '--exnamespaces',
        metavar="1,2,3",
        help='comma-separated value of namespaces to exclude')
    groupDownload.add_argument(
        '--apiexport',
        action='store_true',
        help="Use the API (action=query) instead of Special:Export to export pages")

    # Meta info params
    groupMeta = parser.add_argument_group(
@@ -1824,6 +2055,8 @@ def getParameters(params=[]):
        index2 = check[1]
        api = checkedapi
        print 'API is OK: ' + checkedapi
    elif args.ignore_api_check:
        print 'Error in API. Ignoring.'
    else:
        if index and not args.wiki:
            print 'API not available. Trying with index.php only.'
@@ -1921,6 +2154,7 @@ def getParameters(params=[]):
        'cookies': args.cookies or '',
        'delay': args.delay,
        'retries': int(args.retries),
        'apiexport': args.apiexport
    }

    other = {