Merged
Add support to back up pages using API:Query instead of Special:Export
(WikiTeam#280), merged into the recent version of dumpgenerator.py.

Adds an additional parameter, --apiexport, which issues a query request
against api.php instead of submitting to Special:Export, so pages can be
exported from sites where Special:Export is disabled.
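For context, here is a minimal sketch of the kind of request the --apiexport path issues, assuming a standard MediaWiki api.php endpoint. The URL and page title below are placeholders; the parameter names mirror the new getXMLPageWithApi code in the diff.

    import requests

    # Placeholder endpoint; substitute the target wiki's api.php.
    API_URL = 'https://wiki.example.org/w/api.php'

    # Full-history request, paged via rvcontinue (rvlimit revisions per call).
    params = {
        'action': 'query',
        'format': 'xml',
        'prop': 'revisions',
        'titles': 'Main_Page',  # placeholder title
        'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
        'rvlimit': 10,
    }
    r = requests.get(API_URL, params=params)
    print(r.text[:200])  # <api> XML that the new code rebuilds into <page>/<revision> elements

With the new flag, an invocation would look roughly like:

    python dumpgenerator.py --api=https://wiki.example.org/w/api.php --xml --apiexport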
GERZAC1002 committed Apr 11, 2022
1 parent 054397a commit 5e1978e
252 changes: 243 additions & 9 deletions dumpgenerator.py
@@ -23,10 +23,21 @@
    from kitchen.text.converters import getwriter, to_unicode
except ImportError:
    print "Please install the kitchen module."

try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET

import xml.dom.minidom as MD

import cookielib
import cPickle
import datetime
import sys
import io
import traceback

try:
    import argparse
except ImportError:
@@ -63,7 +74,7 @@
UTF8Writer = getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

__VERSION__ = '0.4.0-alpha' # major, minor, micro: semver.org
__VERSION__ = '0.5.0-alpha' # major, minor, micro: semver.org

class PageMissingError(Exception):
    def __init__(self, title, xml):
@@ -164,7 +175,7 @@ def getNamespacesScraper(config={}, session=None):
    namespacenames = {0: ''}  # main is 0, no prefix
    if namespaces:
        r = session.post(
            url=config['index'], params={'title': 'Special:Allpages'}, timeout=30)
            url=config['index'], params={'title': 'Special:Allpages'}, timeout=120)
        raw = r.text
        delay(config=config, session=session)

@@ -206,7 +217,7 @@ def getNamespacesAPI(config={}, session=None):
                'meta': 'siteinfo',
                'siprop': 'namespaces',
                'format': 'json'},
            timeout=30
            timeout=120
        )
        result = getJSON(r)
        delay(config=config, session=session)
@@ -281,7 +292,7 @@ def getPageTitlesScraper(config={}, session=None):
        print '    Retrieving titles in the namespace', namespace
        url = '%s?title=Special:Allpages&namespace=%s' % (
            config['index'], namespace)
        r = session.get(url=url, timeout=30)
        r = session.get(url=url, timeout=120)
        raw = r.text
        raw = cleanHTML(raw)

@@ -455,7 +466,7 @@ def getXMLHeader(config={}, session=None):

    else:
        try:
            xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
            xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
        except PageMissingError as pme:
            # The <page> does not exist. Not a problem, if we get the <siteinfo>.
            xml = pme.xml
@@ -477,7 +488,7 @@ def getXMLHeader(config={}, session=None):
                    )
                    config['export'] = json.loads(r.text)['query']['namespaces']['-1']['*'] \
                        + ':Export'
                    xml = "".join([x for x in getXMLPage(config=config, title=randomtitle, verbose=False, session=session)])
                    xml = "".join([x for x in getXMLPage_(config=config, title=randomtitle, verbose=False, session=session)])
            except PageMissingError as pme:
                xml = pme.xml
            except ExportAbortedError:
@@ -500,7 +511,7 @@

def getXMLFileDesc(config={}, title='', session=None):
    """ Get XML for image description page """
    config['curonly'] = 1  # tricky to get only the most recent desc
    return("".join([x for x in getXMLPage( config=config, title=title, verbose=False, session=session)]))
    return("".join([x for x in getXMLPage_( config=config, title=title, verbose=False, session=session)]))


def getUserAgent():
@@ -521,7 +532,216 @@ def logerror(config={}, text=''):
            output = u'%s: %s\n' % (
                datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), text)
            outfile.write(output.encode('utf-8'))


def reconstructRevisions(root=None):
    # print ET.tostring(rev)
    page = ET.Element('stub')
    edits = 0
    for rev in root.find('query').find('pages').find('page').find('revisions').findall('rev'):
        try:
            rev_ = ET.SubElement(page, 'revision')
            ET.SubElement(rev_, 'id').text = rev.attrib['revid']
            ET.SubElement(rev_, 'timestamp').text = rev.attrib['timestamp']
            contributor = ET.SubElement(rev_, 'contributor')
            if not rev.attrib.has_key('userhidden'):
                ET.SubElement(contributor, 'username').text = rev.attrib['user']
                ET.SubElement(contributor, 'id').text = rev.attrib['userid']
            else:
                contributor.set('deleted', 'deleted')
            comment = ET.SubElement(rev_, 'comment')
            if not rev.attrib.has_key('commenthidden'):
                comment.text = rev.attrib['comment']
            else:
                comment.set('deleted', 'deleted')

            # some revisions do not return model and format, so hard-code them
            ET.SubElement(rev_, 'model').text = 'wikitext'
            ET.SubElement(rev_, 'format').text = 'text/x-wiki'
            text = ET.SubElement(rev_, 'text')
            if not rev.attrib.has_key('texthidden'):
                text.attrib['xml:space'] = "preserve"
                text.attrib['bytes'] = rev.attrib['size']
                text.text = rev.text
            else:
                text.set('deleted', 'deleted')
            # delete sha1 here :)
            #sha1 = ET.SubElement(rev_,'sha1')
            #if not rev.attrib.has_key('sha1missing'):
                #sha1.text = rev.attrib['sha1']
            if rev.attrib.has_key('minor'):
                ET.SubElement(rev_, 'minor')
            edits += 1
        except Exception as e:
            #logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
            print ET.tostring(rev)
            traceback.print_exc()
            page = None
            edits = 0
            raise e
    return page, edits

def getXMLPageCoreWithApi(headers={}, params={}, config={}, session=None):
    """ """
    # just send the API request
    # if it fails, it will reduce params['rvlimit']
    xml = ''
    c = 0
    maxseconds = 100  # max seconds to wait in a single sleeping
    maxretries = config['retries']  # x retries and skip
    increment = 20  # increment every retry

    while not re.search(r'</api>' if not config['curonly'] else r'</mediawiki>', xml) or re.search(r'</error>', xml):
        if c > 0 and c < maxretries:
            wait = increment * c < maxseconds and increment * \
                c or maxseconds  # incremental until maxseconds
            print '    In attempt %d, XML for "%s" is wrong. Waiting %d seconds and reloading...' % (c, params['titles' if config['apiexport'] else 'pages'], wait)
            time.sleep(wait)
            # reducing server load requesting smallest chunks (if curonly then
            # rvlimit = 1 from mother function)
            if params['rvlimit'] > 1:
                params['rvlimit'] = params['rvlimit'] / 2  # half
        if c >= maxretries:
            print '    We have retried %d times' % (c)
            print '    MediaWiki error for "%s", network error or whatever...' % (params['titles' if config['apiexport'] else 'pages'])
            # If it's not already what we tried: our last chance, preserve only the last revision...
            # config['curonly'] means that the whole dump is configured to save only the last,
            # params['curonly'] should mean that we've already tried this
            # fallback, because it's set by the following if and passed to
            # getXMLPageCore
            # TODO: save only the last version when failed
            print '    Saving in the errors log, and skipping...'
            logerror(
                config=config,
                text=u'Error while retrieving the last revision of "%s". Skipping.' %
                (params['titles' if config['apiexport'] else 'pages']).decode('utf-8'))
            # raise ExportAbortedError(config['index'])
            return ''  # empty xml

        # FIXME HANDLE HTTP Errors HERE
        try:
            r = session.get(url=config['api'], params=params, headers=headers)
            handleStatusCode(r)
            xml = fixBOM(r)
            # print xml
        except requests.exceptions.ConnectionError as e:
            print '    Connection error: %s' % (str(e[0]))
            xml = ''
        c += 1
    return xml
def getXMLPageWithApi(config={}, title='', verbose=True, session=None):
    """ Get the full history (or only the current revision) of a page using API:Query.
        If config['curonly'] is set, export&exportnowrap is used instead.
    """

    title_ = title
    title_ = re.sub(' ', '_', title_)
    # do not convert & into %26, title_ = re.sub('&', '%26', title_)
    # action=query&rvlimit=50&format=xml&prop=revisions&titles=TITLE_HERE
    # &rvprop=timestamp%7Cuser%7Ccomment%7Ccontent%7Cids%7Cuserid%7Csha1%7Csize
    # print 'current:%s' % (title_)
    if not config['curonly']:
        params = {'titles': title_, 'action': 'query', 'format': 'xml',
                  'prop': 'revisions',
                  'rvprop': 'timestamp|user|comment|content|ids|userid|sha1|size|flags',
                  'rvcontinue': None,
                  'rvlimit': 10  # TODO: set this by commandline
                  }
    else:
        params = {'titles': title_, 'action': 'query', 'format': 'xml', 'export': 1, 'exportnowrap': 1}
    # print 'params:%s' % (params)
    if not config['curonly']:
        firstpartok = False
        lastcontinue = None
        numberofedits = 0
        ret = ''
        while True:
            # in case the last request is not right, save last time's progress
            if not firstpartok:
                try:
                    lastcontinue = params['rvcontinue']
                except:
                    lastcontinue = None

            xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
            if xml == "":
                # just return so that we can continue; getXMLPageCoreWithApi will log the error
                return
            try:
                root = ET.fromstring(xml.encode('utf-8'))
            except:
                continue
            try:
                retpage = root.find('query').find('pages').find('page')
            except:
                continue
            if retpage.attrib.has_key('missing') or retpage.attrib.has_key('invalid'):
                print 'Page not found'
                raise PageMissingError(params['titles'], xml)
            if not firstpartok:
                try:
                    # build the first part ourselves to reduce memory usage
                    ret = '  <page>\n'
                    ret += '    <title>%s</title>\n' % (retpage.attrib['title'])
                    ret += '    <ns>%s</ns>\n' % (retpage.attrib['ns'])
                    ret += '    <id>%s</id>\n' % (retpage.attrib['pageid'])
                except:
                    firstpartok = False
                    continue
                else:
                    firstpartok = True
                    yield ret
            try:
                ret = ''
                edits = 0
                if config['curonly'] or root.find('continue') == None:
                    # transform the revisions
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    # convert it into text in case it throws MemoryError
                    # drop the first three lines and the last two lines, which only set the indent
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    yield ret
                    numberofedits += edits
                    break
                else:
                    rev_, edits = reconstructRevisions(root=root)
                    xmldom = MD.parseString('<stub1>' + ET.tostring(rev_) + '</stub1>')
                    ret += ''.join(xmldom.toprettyxml(indent='  ').splitlines(True)[3:-2])
                    params['rvcontinue'] = root.find('continue').attrib['rvcontinue']
                    numberofedits += edits
                    yield ret
            except:
                traceback.print_exc()
                params['rvcontinue'] = lastcontinue
                ret = ''
        yield '  </page>\n'
    else:
        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
        if xml == "":
            raise ExportAbortedError(config['index'])
        if not "</page>" in xml:
            raise PageMissingError(params['titles'], xml)
        else:
            # strip these sha1 sums which keep showing up in the export and
            # which are invalid for the XML schema (they only apply to
            # revisions)
            xml = re.sub(r'\n\s*<sha1>\w+</sha1>\s*\n', r'\n', xml)
            xml = re.sub(r'\n\s*<sha1/>\s*\n', r'\n', xml)

        yield xml.split("</page>")[0]

        # just for looking good :)
        r_timestamp = r'<timestamp>([^<]+)</timestamp>'

        numberofedits = 0
        numberofedits += len(re.findall(r_timestamp, xml))

        yield "</page>\n"

    if verbose:
        if (numberofedits == 1):
            print '    %s, 1 edit' % (title.strip())
        else:
            print '    %s, %d edits' % (title.strip(), numberofedits)


def getXMLPageCore(headers={}, params={}, config={}, session=None):
    """ """
@@ -694,7 +914,13 @@ def getXMLPage(config={}, title='', verbose=True, session=None):
            print '    %s, 1 edit' % (title.strip())
        else:
            print '    %s, %d edits' % (title.strip(), numberofedits)


def getXMLPage_(config={}, title='', verbose=True, session=None):
    # print config
    if config['apiexport']:
        return getXMLPageWithApi(config=config, title=title, verbose=verbose, session=session)
    else:
        return getXMLPage(config=config, title=title, verbose=verbose, session=session)
    return ''

def makeXmlPageFromRaw(xml):
""" Discard the metadata around a <page> element in <mediawiki> string"""
@@ -775,7 +1001,7 @@ def generateXMLDump(config={}, titles=[], start=None, session=None):
            if c % 10 == 0:
                print 'Downloaded %d pages' % (c)
            try:
                for xml in getXMLPage(config=config, title=title, session=session):
                for xml in getXMLPage_(config=config, title=title, session=session):
                    xml = cleanXML(xml=xml)
                    xmlfile.write(xml.encode('utf-8'))
            except PageMissingError:
@@ -1680,6 +1906,7 @@ def getParameters(params=[]):
        action='store_true',
        help='resumes previous incomplete dump (requires --path)')
    parser.add_argument('--force', action='store_true', help='')
    parser.add_argument('--ignore-api-check', action='store_true', help='')
    parser.add_argument(
        '--user', help='Username if authentication is required.')
    parser.add_argument(
@@ -1723,6 +1950,10 @@ def getParameters(params=[]):
        '--exnamespaces',
        metavar="1,2,3",
        help='comma-separated value of namespaces to exclude')
    groupDownload.add_argument(
        '--apiexport',
        action='store_true',
        help="Use the API (action=query) instead of Special:Export to export pages")

    # Meta info params
    groupMeta = parser.add_argument_group(
@@ -1824,6 +2055,8 @@ def getParameters(params=[]):
        index2 = check[1]
        api = checkedapi
        print 'API is OK: ' + checkedapi
    elif args.ignore_api_check:
        print 'Error in API. Ignoring.'
    else:
        if index and not args.wiki:
            print 'API not available. Trying with index.php only.'
@@ -1921,6 +2154,7 @@ def getParameters(params=[]):
        'cookies': args.cookies or '',
        'delay': args.delay,
        'retries': int(args.retries),
        'apiexport': args.apiexport
    }

    other = {