-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmongo_chunk.py
71 lines (54 loc) · 2.18 KB
/
mongo_chunk.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/python
import argparse
from pymongo import MongoClient as Client
from bson import BSON
from bson import json_util
import json
import os
# mongo client (no host/port is passed, so pymongo's own defaults apply)
client = Client()

# script arguments
parser = argparse.ArgumentParser(description='Export a Mongo Collection into json chunks given an offset')
# --db and --collection are mandatory: without them client[None] / db[None]
# would fail later with an opaque TypeError instead of a usage message.
parser.add_argument('--db', metavar='d', type=str, required=True, help='specify a mongo db', dest='db')
parser.add_argument('--collection', metavar='c', type=str, required=True, help='specify a mongo collection', dest='collection')
# The help text has always advertised a default of 1000; actually set it so
# that omitting --offset no longer leaves args.offset as None.
parser.add_argument('--offset', metavar='O', type=int, default=1000, help='specify an offset (default=1000)', dest='offset')
parser.add_argument('--prefix', metavar='o', type=str, help='specify an output prefix', dest='prefix')
parser.add_argument('--dir', metavar='D', type=str, help='specify an existing directory', dest='directory', default='')
args = parser.parse_args()

# variables derived from the parsed arguments; json_chunks() reads
# `collection` and `args` as module-level globals.
db = client[args.db]
collection = db[args.collection]
offset = args.offset  # number of documents written to each chunk file
prefix = args.prefix

# Single-argument print(...) produces identical output on Python 2 and
# also runs on Python 3.
print('Selected database: {}'.format(db))
print('Selected collection: {}'.format(collection))
print('Offset: {}'.format(offset))

# Not referenced anywhere else in the visible source; kept as-is.
last_offset = 0
def _count_documents(coll):
    """Return the number of documents in *coll*.

    Uses ``count_documents({})`` when the collection class defines it and
    falls back to the legacy ``find().count()`` otherwise. The check is made
    on the class rather than the instance so that dynamic attribute lookup
    on the collection object cannot produce a false positive.
    """
    if hasattr(type(coll), 'count_documents'):
        return coll.count_documents({})
    return coll.find().count()


def json_chunks(offset):
    """Export the module-level ``collection`` into numbered JSON files.

    Documents are read ``offset`` at a time with ``skip``/``limit`` and each
    batch is written as an indented JSON array to
    ``<args.directory>/<args.prefix>_chunk__<n>.json`` (``n`` starts at 0).
    The final file holds whatever remains when the total is not a multiple
    of ``offset``.

    Args:
        offset: Maximum number of documents per output file; must be a
            positive integer no larger than the collection's document count.

    Raises:
        SystemExit: With status 1 when ``offset`` is missing or not
            positive, or when it exceeds the number of documents.

    Reads the globals ``collection``, ``args`` and ``json_util``.
    """
    # Reject sizes that would otherwise surface as ZeroDivisionError /
    # ValueError (0, negatives) or TypeError (None) further down.
    if offset is None or offset <= 0:
        print('! Error: offset must be a positive integer')
        raise SystemExit(1)

    total_docs = _count_documents(collection)
    if total_docs < offset:
        print('! Error: offset is greater than number of total documents')
        # Report the failure through the exit status as well as the message.
        raise SystemExit(1)

    # NOTE(review): no sort is applied to the queries, so chunk boundaries
    # rely on the server returning documents in the same order for every
    # skip/limit call -- confirm this is acceptable for the target data.
    for chunk_idx, start in enumerate(range(0, total_docs, offset)):
        # Clamp the last batch to the documents that are actually left,
        # without overwriting the caller's chunk size.
        limit = min(offset, total_docs - start)
        json_file_path = os.path.join(
            '{}'.format(args.directory),
            '{}_chunk__{}.json'.format(args.prefix, chunk_idx),
        )

        print('i: {}'.format(start))
        print('offset: {}'.format(start + limit))
        print('query: collection.find().skip({}).limit({})'.format(start, limit))

        docs = collection.find().skip(start).limit(limit)
        # Serialize before opening the file so a serialization failure does
        # not leave an empty or truncated chunk on disk.
        dump = json.dumps(list(docs), sort_keys=False, indent=4, default=json_util.default)
        with open(json_file_path, 'w') as outfile:
            outfile.write(dump)

    print('export successfully finished')
# Entry point: run the export using the chunk size taken from --offset.
json_chunks(offset)