# -*- coding: utf-8 -*-
"""
autoSB.py
By: Emily Sturdivant, [email protected]
Last modified: 1/10/17
OVERVIEW: Functions used in sb_automation.py
"""
#%% Import packages
import sciencebasepy as pysb
import os
import sys
import shutil
import glob
from lxml import etree
import json
import pickle
from datetime import datetime
import time
import io
import re
__all__ = ['splitall', 'splitall2', 'remove_files', 'trunc', 'replace_in_file',
'get_title_from_data', 'get_root_flexibly', 'add_element_to_xml', 'fix_attrdomv_error',
'remove_xml_element', 'replace_element_in_xml', 'map_newvals2xml',
'find_and_replace_text', 'find_and_replace_from_dict',
'update_xml_tagtext', 'flip_dict', 'update_xml', 'update_all_xmls', 'json_from_xml',
'get_fields_from_xml', 'log_in', 'log_in2', 'flexibly_get_item',
'get_DOI_from_item', 'fix_falsefolder', 'rename_dirs_from_xmls', 'setup_subparents', 'inherit_SBfields', 'find_or_create_child',
'upsert_metadata', 'replace_files_by_ext', 'upload_files', 'upload_files_matching_xml',
'upload_shp', 'find_browse_in_json', 'update_browse', 'update_all_browse_graphics', 'upload_all_updated_xmls', 'get_parent_bounds', 'get_idlist_bottomup',
'set_parent_extent', 'find_browse_file', 'upload_all_previewImages2', 'upload_all_previewImages', 'shp_to_new_child',
'update_datapage', #'update_subpages_from_landing',
'get_pageid_from_xmlpath',
'update_pages_from_XML_and_landing', 'remove_all_files',
'update_existing_fields',
'delete_all_children', 'remove_all_child_pages',
'check_fields', 'check_fields2', 'check_fields3', 'check_fields2_topdown',
'landing_page_from_parentdir', 'inherit_topdown',
'apply_topdown', 'apply_bottomup', 'restore_original_xmls']
#%% Functions
def splitall(path):
allparts = []
while 1:
parts = os.path.split(path)
if parts[0] == path: # sentinel for absolute paths
allparts.insert(0, parts[0])
break
elif parts[1] == path: # sentinel for relative paths
allparts.insert(0, parts[1])
break
else:
path = parts[0]
allparts.insert(0, parts[1])
return(allparts)
def splitall2(path):
    # List cumulative directory paths; each successive item adds one more directory to the path.
allparts2 = []
while 1:
parts = os.path.split(path)
if parts[0] == path: # sentinel for absolute paths
allparts2.insert(0, path)
break
elif parts[1] == path: # sentinel for relative paths
allparts2.insert(0, path)
break
else:
allparts2.insert(0, path)
path = parts[0]
return(allparts2)
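# Example of the two splitters (hedged sketch; 'vol1/field_data/site_A' is a hypothetical relative path,
# not a path used by this project):
#   splitall('vol1/field_data/site_A')   ->  ['vol1', 'field_data', 'site_A']
#   splitall2('vol1/field_data/site_A')  ->  ['vol1', 'vol1/field_data', 'vol1/field_data/site_A']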
def remove_files(parentdir, pattern='**/*.xml_orig'):
# Recursively remove files matching pattern
xmllist = glob.glob(os.path.join(parentdir, pattern), recursive=True)
for xml_file in xmllist:
os.remove(xml_file)
return(parentdir)
###################################################
#
# Work with XML
#
###################################################
def trunc(string, length=40):
string = (string[:length-3] + '...') if len(string) > length else string
return(string)
def replace_in_file(fname, fstr, rstr, fill='xxx'):
with io.open(fname, 'r', encoding='utf-8') as f:
s = f.read()
s, ct = re.subn(fstr, rstr, s)
print("Replaced values matching '{}': {}.".format(trunc(fstr), ct))
ct_fills = len(re.findall('(?i){}'.format(fill), s)) # Count remaining xxx values
if ct_fills > 0:
print("Found {} '{}' fills remaining.".format(ct_fills, fill))
with io.open(fname, 'w', encoding='utf-8') as f:
f.write(s)
return(fname)
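# Example usage of replace_in_file (hedged; the file name and fill string below are hypothetical).
# Note that fstr is treated as a regular expression by re.subn.
#   replace_in_file('meta_template.xml', fstr='AUTHOR NAME xxx', rstr='E. Sturdivant')
# After the substitution, any remaining case-insensitive 'xxx' placeholders are counted and reported.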
def get_title_from_data(xml_file, metadata_root=False):
try:
if not metadata_root:
tree = etree.parse(xml_file) # parse metadata using etree
metadata_root=tree.getroot()
title = metadata_root.findall('./idinfo/citation/citeinfo/title')[0].text # Get title of child from XML
return title
except Exception as e:
print("Exception while trying to parse XML file ({}): {}".format(xml_file, e), file=sys.stderr)
return False
def get_root_flexibly(in_metadata):
# Whether in_metadata is a filename or an element, get metadata_root
# in_metadata accepts either xml file or root element of parsed metadata.
if type(in_metadata) is etree._Element:
metadata_root = in_metadata
tree = False
xml_file =False
elif type(in_metadata) is str:
xml_file = in_metadata
try:
tree = etree.parse(xml_file) # parse metadata using etree
except etree.XMLSyntaxError as e:
print("XML Syntax Error while trying to parse XML file: {}".format(e))
return False
except Exception as e:
print("Exception while trying to parse XML file: {}".format(e))
return False
metadata_root=tree.getroot()
    else:
        raise TypeError("{} is not an accepted variable type for 'in_metadata'".format(in_metadata))
return(metadata_root, tree, xml_file)
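# Example usage of get_root_flexibly (hedged; 'dataset1_meta.xml' is a hypothetical file):
#   metadata_root, tree, xml_file = get_root_flexibly('dataset1_meta.xml')
#   # ...modify metadata_root in place, then write back only if a file path was passed in:
#   if xml_file:
#       tree.write(xml_file)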
def add_element_to_xml(in_metadata, new_elem, containertag='./idinfo'):
    # Append element 'new_elem' to 'containertag' in the XML.
    # in_metadata accepts either an XML file path or the root element of parsed metadata.
    # new_elem accepts either an lxml._Element or an XML string.
# Whether in_metadata is a filename or an element, get metadata_root
# FIXME: Check whether element already exists
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
# If new element is still a string convert it to an XML element
if type(new_elem) is str:
new_elem = etree.fromstring(new_elem)
elif not type(new_elem) is etree._Element:
raise TypeError("'new_elem' takes either strings or elements.")
# Append new_elem to containertag element
elem = metadata_root.findall(containertag)[0]
elem.append(new_elem) # append new tag to container element
# Either overwrite XML file with new XML or return the updated metadata_root
if type(xml_file) is str:
tree.write(xml_file)
return(xml_file)
else:
return(metadata_root)
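# Example usage of add_element_to_xml (hedged; the XML snippet and file name are hypothetical):
#   new_elem = '<keywords><theme><themekt>None</themekt><themekey>shoreline</themekey></theme></keywords>'
#   add_element_to_xml('dataset1_meta.xml', new_elem, containertag='./idinfo')
# Passing a root element instead of a file path returns the modified element without writing to disk.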
def fix_attrdomv_error(in_metadata, verbose=False):
# Fix attrdomv so that each has only one subelement
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
attrdomv = metadata_root.findall('./eainfo/detailed/attr/attrdomv')
for elem in attrdomv:
        subelems = list(elem) # list() replaces the deprecated getchildren()
if len(subelems) > 1:
if verbose:
print('fixing error in Attribute Domain Values...')
parent = elem.getparent()
for child in subelems:
new_elem = parent.makeelement(elem.tag)
new_elem.insert(0, child)
parent.append(new_elem)
parent.remove(elem)
if type(xml_file) is str:
tree.write(xml_file)
return xml_file
else:
return metadata_root
def remove_xml_element(in_metadata, path='./', fill_text=['AUTHOR']):
# Remove XML elements in path that contain fill text
""" Example:
tree = etree.parse(xml_file)
metadata_root = tree.getroot()
metadata_root = remove_xml_element(metadata_root)
tree.write(xml_file)
"""
# get metadata root
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
# get fill_text as list of strings
    if type(fill_text) is str:
        fill_text = [fill_text]
    elif not type(fill_text) is list:
        raise TypeError('fill_text must be a string or a list of strings')
# Search the matching tags for fill_text and remove all elements in which it is found.
container, tag = os.path.split(path)
parent_elem = metadata_root.find(container)
for elem in parent_elem.iter(tag):
for text in elem.itertext():
for ftext in fill_text:
if ftext in text:
parent_elem.remove(elem)
# Either overwrite XML file with new XML or return the updated metadata_root
if type(xml_file) is str:
tree.write(xml_file)
return(xml_file)
else:
return(metadata_root)
def replace_element_in_xml(in_metadata, new_elem, containertag='./distinfo'):
# Overwrites the first element in containertag corresponding to the tag of new_elem
# in_metadata accepts either xml file or root element of parsed metadata.
# new_elem accepts either lxml._Element or XML string
# Whether in_metadata is a filename or an element, get metadata_root
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
# If new element is still a string convert it to an XML element
if type(new_elem) is str:
new_elem = etree.fromstring(new_elem)
elif not type(new_elem) is etree._Element:
raise TypeError("'new_elem' takes either strings or elements.")
# Replace element with new_elem
elem = metadata_root.findall(containertag)[0]
old_elem = elem.findall(new_elem.tag)[0]
elem.replace(old_elem, new_elem)
# Either overwrite XML file with new XML or return the updated metadata_root
if type(xml_file) is str:
tree.write(xml_file)
return(xml_file)
else:
return(metadata_root)
def replace_element_in_xml_for_wrapper(metadata_root, new_elem, containertag='./distinfo'):
if type(new_elem) is str:
new_elem = etree.fromstring(new_elem)
elif not type(new_elem) is etree._Element:
raise TypeError("'new_elem' takes either strings or elements.")
# Replace element with new_elem
elem = metadata_root.findall(containertag)[0]
old_elem = elem.findall(new_elem.tag)[0]
elem.replace(old_elem, new_elem)
return metadata_root
def xml_write_wrapper(in_metadata, new_elem, containertag='./distinfo'):
# FIXME: I don't actually know how to make a wrapper. This is completely unchecked.
# in_metadata accepts either xml file or root element of parsed metadata.
# Whether in_metadata is a filename or an element, get metadata_root
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
# if type(in_metadata) is etree._Element:
# metadata_root = in_metadata
# xml_file =False
# elif type(in_metadata) is str:
# xml_file = in_metadata
# tree = etree.parse(xml_file) # parse metadata using etree
# metadata_root=tree.getroot()
# else:
# print("{} is not an accepted variable type for 'in_metadata'".format(in_metadata))
# If new element is still a string convert it to an XML element
replace_element_in_xml_for_wrapper(metadata_root, new_elem, containertag)
# Either overwrite XML file with new XML or return the updated metadata_root
if type(xml_file) is str:
tree.write(xml_file)
return(xml_file)
else:
return(metadata_root)
def map_newvals2xml(new_values):
# Create dictionary of {new value: {XPath to element: position of element in list retrieved by XPath}}
"""
To update XML elements with new text:
for newval, elemfind in val2xml.items():
for elempath, i in elemfind.items():
metadata_root.findall(elempath)[i].text = newval
Currently hard-wired; will need to be adapted to match metadata scheme.
"""
# Hard-wire path in metadata to each element
seriesid = './idinfo/citation/citeinfo/serinfo/issue' # Citation / Series / Issue Identification
citelink = './idinfo/citation/citeinfo/onlink' # Citation / Online Linkage
lwork_link = './idinfo/citation/citeinfo/lworkcit/citeinfo/onlink' # Larger Work / Online Linkage
lwork_serID = './idinfo/citation/citeinfo/lworkcit/citeinfo/serinfo/issue' # Larger Work / Series / Issue Identification
lwork_pubdate = './idinfo/citation/citeinfo/lworkcit/citeinfo/pubdate' # Larger Work / Publish date
edition = './idinfo/citation/citeinfo/edition' # Citation / Edition
pubdate = './idinfo/citation/citeinfo/pubdate' # Citation / Publish date
caldate = './idinfo/timeperd/timeinfo/sngdate/caldate'
networkr = './distinfo/stdorder/digform/digtopt/onlinopt/computer/networka/networkr' # Network Resource Name
accinstr = './distinfo/stdorder/digform/digtopt/onlinopt/accinstr'
metadate = './metainfo/metd' # Metadata Date
browsen = './idinfo/browse/browsen'
# Initialize storage dictionary
val2xml = {}
# DOI values
if 'doi' in new_values.keys():
# get DOI values (as issue and URL)
doi_issue = "DOI:{}".format(new_values['doi'])
doi_url = "https://doi.org/{}".format(new_values['doi'])
# add new DOI values as {DOI:XXXXX:{'./idinfo/.../issue':0}}
val2xml[doi_issue] = {seriesid:0, lwork_serID:0}
val2xml[doi_url] = {citelink: 0, lwork_link: 0, networkr: 2}
# Landing URL
if 'landing_id' in new_values.keys():
landing_link = 'https://www.sciencebase.gov/catalog/item/{}'.format(new_values['landing_id'])
val2xml[landing_link] = {lwork_link: 1}
# Data page URL
if 'child_id' in new_values.keys():
# get URLs
page_url = 'https://www.sciencebase.gov/catalog/item/{}'.format(new_values['child_id']) # data_item['link']['url']
directdownload_link = 'https://www.sciencebase.gov/catalog/file/get/{}'.format(new_values['child_id'])
# add values
val2xml[page_url] = {citelink: 1, networkr: 0}
val2xml[directdownload_link] = {networkr:1}
access_str = 'The first link is to the page containing the data. The second is a direct link to download all data available from the page as a zip file. The final link is to the publication landing page. The data page (first link) may have additional data access options, including web services.'
val2xml[access_str] = {accinstr: 0}
    # Browse graphic (requires 'child_id', which defines the direct download link)
    if 'browse_file' in new_values.keys() and 'child_id' in new_values.keys():
        browse_link = '{}/?name={}'.format(directdownload_link, new_values['browse_file'])
        val2xml[browse_link] = {browsen: 0}
# Edition
if 'edition' in new_values.keys():
val2xml[new_values['edition']] = {edition:0}
if 'pubdate' in new_values.keys():
val2xml[new_values['pubdate']] = {pubdate:0, lwork_pubdate:0} # removed caldate
# Date and time of update
now_str = datetime.now().strftime("%Y%m%d")
val2xml[now_str] = {metadate: 0}
return(val2xml)
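# Example of map_newvals2xml input and output (hedged; the IDs and DOI below are made up):
#   new_values = {'doi': '10.5066/F7XXXXXXX', 'landing_id': '123abc', 'child_id': '456def', 'pubdate': '2017'}
#   val2xml = map_newvals2xml(new_values)
#   # val2xml maps each new text value to {XPath: index}, e.g.
#   # val2xml['DOI:10.5066/F7XXXXXXX'] == {'./idinfo/citation/citeinfo/serinfo/issue': 0,
#   #                                      './idinfo/citation/citeinfo/lworkcit/citeinfo/serinfo/issue': 0}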
def find_and_replace_text(fname, findstr='http:', replacestr='https:'):
os.rename(fname, fname+'.tmp')
with open(fname+'.tmp', 'r') as f1:
with open(fname, 'w') as f2:
for line in f1:
f2.write(line.replace(findstr, replacestr))
os.remove(fname+'.tmp')
return fname
def find_and_replace_from_dict(fname, find_dict):
# Takes dictionary of {find_value: replace_value}
ct = 0
with io.open(fname, 'r', encoding='utf-8') as f:
s = f.read()
# Iterate through find:replace pairs
for fstr, rstr in find_dict.items():
s = s.replace(fstr, rstr)
with io.open(fname, 'w', encoding='utf-8') as f:
f.write(s)
return(fname)
def update_xml_tagtext(metadata_root, newval, fstr='./distinfo', idx=0):
# Add or update the values of each element
try:
metadata_root.findall(fstr)[idx].text = newval
except IndexError: # if the element does not yet exist, create the element
try:
container, tag = os.path.split(fstr)
elem = metadata_root.find(container)
elem.append(etree.Element(tag))
metadata_root.findall(fstr)[idx].text = newval
except Exception as e:
print('Exception raised: {}'.format(e))
pass
except Exception as e:
print('Exception raised: {}'.format(e))
pass
def flip_dict(in_dict, verbose=False):
# convert nested dictionary structure
# rework the dictionary to {tag fstring: {index: new value}}
out_dict = {}
for newval, elemfind in in_dict.items(): # Update elements with new ID text
for fstr, idx in elemfind.items():
if not fstr in out_dict:
if verbose:
print(fstr)
out_dict[fstr] = {idx: newval}
else:
if verbose:
print(' {}: {}'.format(idx, newval))
out_dict[fstr][idx] = newval
return(out_dict)
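# Example of flip_dict (hedged; toy values):
#   flip_dict({'2017': {'./idinfo/citation/citeinfo/pubdate': 0}})
#   ->  {'./idinfo/citation/citeinfo/pubdate': {0: '2017'}}
# i.e. {new value: {XPath: index}} becomes {XPath: {index: new value}} for element-by-element updates.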
def update_xml(xml_file, new_values, verbose=False):
# update XML file to include new child ID and DOI
#%% Map new values to their appropriate metadata elements
e2nv = map_newvals2xml(new_values)
e2nv_flipped = flip_dict(e2nv, verbose=False)
#%% Update the XML with the new values
# Save the original xml_file if an original is not already present
if not os.path.exists(xml_file+'_orig'):
shutil.copy(xml_file, xml_file+'_orig')
# Parse metadata
metadata_root, tree, xml_file = get_root_flexibly(xml_file)
# Update elements with new text values
for fstr, idx_val in e2nv_flipped.items():
for idx in sorted(idx_val):
newval = idx_val[idx]
# Update elements with new text value
update_xml_tagtext(metadata_root, newval, fstr, idx)
#%% Modify XML as programmed in config file.
if "remove_fills" in new_values:
[remove_xml_element(metadata_root, path, ftext) for path, ftext in new_values['remove_fills'].items()]
if "metadata_additions" in new_values:
[add_element_to_xml(metadata_root, new_elem, containertag) for containertag, new_elem in new_values['metadata_additions'].items()]
if "metadata_replacements" in new_values:
[replace_element_in_xml(metadata_root, new_elem, containertag) for containertag, new_elem in new_values['metadata_replacements'].items()]
#%% Fix common error in which attrdomv has multiple subelements
metadata_root = fix_attrdomv_error(metadata_root)
#%% Save changes - overwrite XML file with new XML
tree.write(xml_file)
#%% Perform find and replace the text in the file
if "find_and_replace" in new_values:
find_and_replace_from_dict(xml_file, new_values['find_and_replace'])
return(xml_file)
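# Example usage of update_xml (hedged sketch; the file name and values are hypothetical):
#   new_values = {'doi': '10.5066/F7XXXXXXX', 'child_id': '456def', 'pubdate': '2017',
#                 'find_and_replace': {'http://www.sciencebase.gov': 'https://www.sciencebase.gov'}}
#   update_xml('dataset1_meta.xml', new_values)
# The original file is preserved alongside the update as 'dataset1_meta.xml_orig'.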
def update_all_xmls(parentdir, new_values, sb=None, dict_DIRtoID=None, verbose=True):
# Update every XML in the directory tree with new values (from config file and SB)
# Does not upload resulting XML to SB.
xmllist = glob.glob(os.path.join(parentdir, '**/*.xml'), recursive=True)
for xml_file in xmllist:
# Update XML
# Get SB values
datapageid = get_pageid_from_xmlpath(xml_file, sb, dict_DIRtoID, parentdir=parentdir, verbose=False)
# add SB UID to be updated in XML
new_values['child_id'] = datapageid
# Look for browse graphic in directory with XML
datadir = os.path.dirname(xml_file)
browse_file = find_browse_file(datadir)
new_values.pop('browse_file', None) # remove value from past iteration
if browse_file:
new_values['browse_file'] = browse_file
# Make the changes to the XML based on the new_values dictionary
update_xml(xml_file, new_values, verbose=verbose) # new_values['pubdate']
if verbose:
print("UPDATED XML: {}".format(xml_file))
return
def json_from_xml():
    # FIXME: Currently hard-wired; will need to be adapted to match the metadata scheme.
dict_xml2sb = dict()
#dict_xml2sb['citation'] =
dict_xml2sb['purpose'] = {'./idinfo/descript/purpose':0}
dict_xml2sb['summary'] = {'./idinfo/descript/abstract':0}
dict_xml2sb['body'] = {'./idinfo/descript/abstract':0}
return dict_xml2sb
def get_fields_from_xml(sb, item, xml_file, sbfields, metadata_root=False):
# Based on desired SB fields, get text values from XML
if not metadata_root:
tree = etree.parse(xml_file) # parse metadata using etree
metadata_root=tree.getroot()
dict_sb_from_xml = json_from_xml() # return dict for locating values in XML
for field in sbfields:
elemfind = dict_sb_from_xml[field]
for fstr,i in elemfind.items():
try:
item[field] = metadata_root.findall(fstr)[i].text
except:
pass
item = sb.updateSbItem(item)
return item
###################################################
#
# SB helper functions
#
###################################################
def log_in(username=None, password=None):
print('Logging in if necessary...')
if 'sb' in globals():
if not sb.is_logged_in():
print('Logging back in...')
else:
return sb
if not username:
username = input("SB username (should be entire USGS email): ")
if not password:
sb = pysb.SbSession(env=None).loginc(username)
else:
try:
sb = pysb.SbSession(env=None).login(username, password)
except Exception as e: # 'Login failed' returned as Exception for bad password in login()
print('{}. Try reentering...'.format(e))
sb = pysb.SbSession(env=None).loginc(username) # 'Invalid password, try again' printed for bad password
except NameError as e:
print('{}. Try reentering...'.format(e))
sb = pysb.SbSession(env=None).loginc(username)
return sb
def log_in2(username=False, password=False, sb=[]):
if not sb.is_logged_in():
print('Logging back in...')
try:
sb = pysb.SbSession(env=None).login(username, password)
except NameError:
sb = pysb.SbSession(env=None).loginc(username)
return sb
def flexibly_get_item(sb, id_or_json, output='item'):
# Given input of either ID or JSON, return ID, link, or JSON item
if type(id_or_json) is str: # If input is ID...
item_id = id_or_json
if output.lower() == 'item' or output.lower() == 'url':
item = sb.get_item(item_id)
elif type(id_or_json) is dict: # If input is JSON...
item = id_or_json
item_id = item['id']
# Return ID, URL, or JSON item (default)
if output.lower() == 'id':
return item_id
elif output.lower() == 'url':
item_link = item['link']['url']
return item_link
else:
return item
def get_DOI_from_item(item):
# Get DOI link from parent_item
doi = False
i = 0
try:
weblinks = item['webLinks']
except:
print("No 'webLinks' in JSON for {}.".format(item['id']))
return False
    while not doi and i < len(weblinks):
        uri = weblinks[i]['uri']
        doi = uri[-16:] if 'doi' in uri.lower() else False
        i += 1
return doi
def fix_falsefolder(sb, falsefolder_id, useremail, password):
    # Occasionally a child page is falsely identified as a folder even though it doesn't have children.
    # In that case, this should fix it:
    #   1. Add a child page to the false folder.
    #   2. Delete the child.
    #   3. Remove 'Folder' from the item's systemTypes.
sb = log_in(useremail, password)
child_item = sb.create_item({'parentId': falsefolder_id, 'title':'meow'})
sb.delete_item(child_item)
falsefolder_item = sb.get_item(falsefolder_id)
falsefolder_item['systemTypes'].remove('Folder')
falsefolder_item = sb.update_item(falsefolder_item)
    if not falsefolder_item['hasChildren']:
        print("'hasChildren' successfully set to False.")
if 'Folder' in falsefolder_item['systemTypes']:
print("Still recognized as a folder... :(")
return
def rename_dirs_from_xmls(parentdir, rename_intermediates=True):
"""
Rename directories that contain a single XML. Rename the directory to the title in the XML file.
Notes:
- I have run this on OSX with colons in the titles without issue. The renamed directories have backslash instead of colon, but the ScienceBase page titles have colons even though they are copied from the directory names.
- Use rename_intermediates to specify whether directory must not contain sub-directories.
- Although colons don't seem to be a problem on Mac, parentheses might be. Don't include parentheses in titles.
"""
# List characters that are invalid pathnames in OSX (:) or Windows (the rest).
invalid_chars = r'< > : " / \ | ? *'.split(' ')
# Initialize
ct = 0
invalid_titles = []
# For all XML files...
xmllist = glob.glob(os.path.join(parentdir, '**/*.xml'), recursive=True)
for xml_file in xmllist:
datadir = os.path.dirname(xml_file)
# Check whether directory meets conditions: contains only one XML and (optionally) does not contain subdirectories.
go = False
        if len(glob.glob(os.path.join(datadir, '**/*.xml'), recursive=True)) == 1:
go = True
# and doesn't contain sub-directories...
if rename_intermediates and any([os.path.isdir(os.path.join(datadir,fn)) for fn in os.listdir(datadir)]):
go = False
# If directory meets the conditions, rename it.
if go:
data_title = get_title_from_data(xml_file)
data_title = data_title.strip('\n')
# Rename if the values don't already match
basedir = os.path.dirname(datadir)
if not datadir == os.path.join(basedir, data_title):
# Check for invalid characters
if any(x in data_title for x in invalid_chars):
invalid_titles += [data_title]
os.rename(datadir, os.path.join(basedir, data_title))
ct+=1
if len(invalid_titles):
print('WARNING: The following titles include invalid characters. Consider changing:\n{}'.format('\n'.join(invalid_titles)))
print("Renamed {} directories.".format(ct))
return
def setup_subparents(sb, parentdir, landing_id, imagefile, verbose=True):
landing_item = sb.get_item(landing_id)
# Initialize dictionaries
dict_DIRtoID = {os.path.basename(parentdir): landing_id} # Initialize [top dir/file: ID] entry to dict
# List XML files
xmllist = glob.glob(os.path.join(parentdir, '**/*.xml'), recursive=True)
for xml_file in xmllist:
# get relative path from parentdir to XML, including parentdir
relpath = os.path.relpath(xml_file, os.path.dirname(parentdir))
# Find or create the SB page for each directory
dirchain = splitall2(os.path.dirname(relpath))
for dirpath in dirchain[1:]:
parent_id = dict_DIRtoID[os.path.dirname(dirpath)] # get ID for parent
if dirpath in dict_DIRtoID:
continue
subpage = find_or_create_child(sb, parent_id, os.path.basename(dirpath), verbose=verbose) # get JSON for subpage based on parent ID and dirpath
if not imagefile == False:
subpage = sb.upload_file_to_item(subpage, imagefile)
# store values in dictionaries
dict_DIRtoID[dirpath] = subpage['id']
return(dict_DIRtoID)
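# Example usage of setup_subparents (hedged; the directory and IDs are hypothetical):
#   dict_DIRtoID = setup_subparents(sb, '/data/release_dir', landing_id='123abc', imagefile=False)
#   # dict_DIRtoID maps each relative directory path (starting with the top directory name)
#   # to the ID of its ScienceBase page, e.g. {'release_dir': '123abc', 'release_dir/field_data': '789ghi'}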
def inherit_SBfields(sb, child_item, inheritedfields=['citation'], verbose=False, inherit_void=True):
# Upsert inheritedfield from parent to child by retrieving parent_item based on child
# Modified 3/8/17: if field does not exist in parent, remove in child
    # If a field name is entered incorrectly, no errors will be thrown, but the page will not be updated.
parent_item = flexibly_get_item(sb, child_item['parentId'])
if verbose:
print("Inheriting fields from parent '{}'".format(trunc(parent_item['title'])))
for field in inheritedfields:
if not field in parent_item:
if inherit_void:
child_item[field] = None
else:
print("Field '{}' does not exist in parent and inherit_void is set to False so the current value will be preserved in child '{}'.".format(field, trunc(child_item['title'])))
else:
try:
child_item[field] = parent_item[field]
except Exception as e:
print(e)
pass
child_item = sb.update_item(child_item)
return(child_item)
def find_or_create_child(sb, parentid, child_title, verbose=False):
# Find or create new child page
for child_id in sb.get_child_ids(parentid): # Check if child page already exists
child_item = sb.get_item(child_id)
if child_item['title'] == child_title:
if verbose:
print("FOUND: page '{}'.".format(trunc(child_title)))
break
else: # If child doesn't already exist, create
child_item = {}
child_item['parentId'] = parentid
child_item['title'] = child_title
child_item = sb.create_item(child_item)
if verbose:
print("CREATED PAGE: '{}' in '{}.'".format(trunc(child_title, 40), sb.get_item(parentid)['title']))
time.sleep(1) # wait 1 sec to ensure that page is registered
return child_item
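# Example usage of find_or_create_child (hedged; the ID and title are hypothetical):
#   child_item = find_or_create_child(sb, '123abc', 'Field data: site A', verbose=True)
# Returns the JSON of the existing child page with that title, or of a newly created one.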
def get_file_upload_time(data_item, file_type='application/fgdc+xml'):
time_uploaded = None
if 'files' in data_item:
for fl_idx, fl_json in enumerate(data_item['files']):
if fl_json['contentType'] == file_type:
time_uploaded = fl_json['dateUploaded']
break
# If the browse graphic got included in a facet (or extension?)
if not time_uploaded and 'facets' in data_item:
for fc_idx, facet in enumerate(data_item['facets']):
for fl_idx, fl_json in enumerate(facet['files']):
if fl_json['contentType'] == file_type:
time_uploaded = fl_json['dateUploaded']
break
if time_uploaded:
break
if not time_uploaded:
print("No XML found.")
return(time_uploaded)
def upsert_metadata(sb, id_or_item, xml_file):
# Remove file with the originalMetadata flag. Then upload the xml file.
data_item = flexibly_get_item(sb, id_or_item, output='item')
found_metadata = False
    if 'files' in data_item:
        keep = [f for f in data_item['files'] if not f['originalMetadata']]
        if len(keep) < len(data_item['files']):
            data_item['files'] = keep
            found_metadata = True
    if 'facets' in data_item:
        for fc_idx, facet in enumerate(data_item['facets']):
            keep = [f for f in facet['files'] if not f['originalMetadata']]
            if len(keep) < len(facet['files']):
                data_item['facets'][fc_idx]['files'] = keep
                found_metadata = True
if found_metadata:
data_item = sb.update_item(data_item)
data_item = sb.upload_file_to_item(data_item, xml_file)
return(data_item)
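# Example usage of upsert_metadata (hedged; the ID and file name are hypothetical):
#   data_item = upsert_metadata(sb, '456def', 'dataset1_meta.xml')
# Any file already flagged as originalMetadata is dropped from the item before the new XML is uploaded.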
#%% Update SB preview image from the uploaded files.
def update_all_browse_graphics(sb, parentdir, landing_id, valid_ids=None, verbose=False):
# Update SB preview image from the uploaded files and update filename and type in XML.
# For every XML in the parentdir (recursive)...
print("Updating browse graphic information...")
xmllist = glob.glob(os.path.join(parentdir, '**/*.xml'), recursive=True)
for xml_file in xmllist:
# Get SB page ID from the XML (needs to be up-to-date)
datapageid = get_pageid_from_xmlpath(xml_file, sb, valid_ids=valid_ids, parentdir=parentdir, verbose=verbose)
        # Run update_browse() to match the XML values with the image file on the SB page:
        # get the browse caption from the XML, find the *browse* image file on SB and set it
        # as the preview image, then update the filename and type in the XML.
if update_browse(sb, xml_file, datapageid, verbose):
# if the XML was updated, replace the XML on the page.
data_item = upsert_metadata(sb, datapageid, xml_file)
return
def upload_all_updated_xmls(sb, parentdir, valid_ids=None):
# Upload XMLs that have been updated since last upload to SB.
# Iterates through local XMLs rather than starting on SB
ct = 0
xmllist = glob.glob(os.path.join(parentdir, '**/*.xml'), recursive=True)
print("Searching {} XML files for changes since last upload...".format(len(xmllist)))
for xml_file in xmllist:
        # Get the SB item that corresponds to the XML file: get the page ID from the SB title
        # (matching the folder name) or from the second link in the XML citeinfo.
datapageid = get_pageid_from_xmlpath(xml_file, sb, valid_ids=valid_ids, parentdir=parentdir)
data_item = flexibly_get_item(sb, datapageid, output='item')
        # Get upload time of XML as UTC datetime object
        xml_uploaded = get_file_upload_time(data_item, file_type='application/fgdc+xml')
        if not xml_uploaded: # no XML on the SB page yet; skip the comparison
            continue
        xml_uploaded = datetime.strptime(xml_uploaded, '%Y-%m-%dT%H:%M:%SZ')
# Get modified time of local XML as UTC datetime
xml_modified = datetime.utcfromtimestamp(os.path.getmtime(xml_file))
# Replace the metadata file if the modified time is greater than the uploaded time
if xml_modified > xml_uploaded:
data_item = upsert_metadata(sb, data_item, xml_file)
# print('UPLOADED: {}'.format(os.path.basename(xml_file)))
ct += 1
if ct > 0:
print("Found and uploaded {} XML files.\n".format(ct))
else:
print("No XMLs have been updated since last upload.\n")
return
def replace_files_by_ext(sb, parentdir, dict_DIRtoID, match_str='*.xml', verbose=True):
for root, dirs, files in os.walk(parentdir):
for d in dirs:
path = os.path.join(root, d)
reldirpath = os.path.relpath(path, os.path.dirname(parentdir))
xmllist = glob.glob(os.path.join(path, match_str))
for xml_file in xmllist:
parentid = dict_DIRtoID[reldirpath]
data_title = get_title_from_data(xml_file) # get title from XML
data_item = find_or_create_child(sb, parentid, data_title, verbose=verbose) # Create (or find) data page based on title
sb.replace_file(xml_file, data_item)
print("REPLACED: {}".format(os.path.basename(xml_file)))
return
def upload_files(sb, item, xml_file, max_MBsize=2000, replace=True, verbose=False):
# Upload all files in the directory to SB page.
if replace:
# Remove all files (and facets) from child page
item = remove_all_files(sb, item, verbose)
# List all files in directory, except original xml and other bad apples
datadir = os.path.dirname(xml_file)
up_files = [os.path.join(datadir, fn) for fn in os.listdir(datadir)
if not fn.endswith('_orig')
and not fn.endswith('DS_Store')
and not fn.endswith('.lock')
and os.path.isfile(os.path.join(datadir, fn))]
    bigfiles = []
    for fn in list(up_files): # iterate over a copy so that removing items doesn't skip files
        if os.path.getsize(fn) > max_MBsize*1000000: # convert megabytes to bytes
            bigfiles.append(os.path.basename(fn))
            up_files.remove(fn)
# Upload all files to child page
if verbose:
start = datetime.now()
print("UPLOADING {}: files in directory '{}'".format(start.strftime("%X"), os.path.basename(datadir)))
if len(bigfiles) == 1:
print("**TO DO** File {} is too big to upload here. Please manually upload afterward.".format(bigfiles))
elif len(bigfiles)>1:
print("**TO DO** Files {} are too big to upload here. Please manually upload afterward.".format(bigfiles))
item = sb.upload_files_and_upsert_item(item, up_files) # upsert should "create or update a SB item"
if verbose:
end = datetime.now()
duration = end - start
print("UPLOAD COMPLETED. Duration: {}".format(duration))
return(item, bigfiles)
def upload_files_matching_xml(sb, item, xml_file, max_MBsize=2000, replace=True, verbose=False):
# Upload all files matching the XML filename to SB page.
# E.g. xml_file = 'path/data_name.ext.xml' will upload all files beginning with 'data_name'
# optionally remove all present files
if replace:
# Remove all files (and facets) from child page
item = remove_all_files(sb, item, verbose)
# List all files matching XML
dataname = xml_file.split('.')[0]
dataname = dataname.split('_meta')[0]
# up_files = glob.glob(searchstr)
up_files = [fn for fn in glob.iglob(dataname + '*')
if not fn.endswith('_orig') and not os.path.isdir(fn)]
    bigfiles = []
    for f in list(up_files): # iterate over a copy so that removing items doesn't skip files
        if os.path.getsize(f) > max_MBsize*1000000: # convert megabytes to bytes
            bigfiles.append(os.path.basename(f))
            up_files.remove(f)
# Upload all files pertaining to data to child page
if verbose:
print("UPLOADING: files matching '{}'".format(os.path.basename(dataname + '*')))
if len(bigfiles)>0 and len(bigfiles)<2:
print("**TO DO** File {} is too big to upload here. Please manually upload afterward.".format(bigfiles))
elif len(bigfiles)>1:
print("**TO DO** Files {} are too big to upload here. Please manually upload afterward.".format(bigfiles))
item = sb.upload_files_and_upsert_item(item, up_files) # upsert should "create or update a SB item"
if verbose:
print("UPLOAD COMPLETED.")
return item, bigfiles
def upload_shp(sb, item, xml_file, replace=True, verbose=False):
# Upload shapefile files to SB page, optionally remove all present files
data_name = os.path.splitext(os.path.basename(xml_file))[0]
datapath = os.path.dirname(xml_file)
if replace:
# Remove all files (and facets) from child page
item = remove_all_files(sb, item, verbose)
# List files pertaining to shapefile for upload
shp_exts = ['.cpg','.dbf','.prj','.sbn','.sbx','.shp','.shx','dbf.xml','.shp.xml']
up_files = []
# Upload all files pertaining to data to child page
for ext in shp_exts:
fname = '{}{}'.format(os.path.splitext(data_name)[0],ext)
if os.path.isfile(os.path.join(datapath,fname)):
up_files.append(os.path.join(datapath,fname))
# Upload files
if verbose:
print('UPLOADING: {} ...'.format(data_name))
item = sb.upload_files_and_upsert_item(item, up_files) # upsert should "create or update a SB item"
return item
def find_browse_in_json(data_item, browse_desc=None, verbose=True):
# Look for browse graphic among the uploaded files
# data_item = sb.get_item(page_id)
browse_image = None
if 'files' in data_item:
for fl_idx, fl_json in enumerate(data_item['files']):
# Find the uploaded file that includes 'browse' and has an extension of png, gif, or jpg
# ALTERNATIVE: could also use the 'contentType' property to match 'image/png', 'image/jpg', or 'image/gif'. This is not more flexible to typos in extensions.
            match = re.fullmatch(r'^.*browse.*\.(png|gif|jpg|jpeg)$', fl_json['name'])
if match:
# If it matches the pattern, set useForPreview to True and add the browse description from the XML as the title
browse_image = match[0]
data_item['files'][fl_idx]['useForPreview'] = True
data_item['files'][fl_idx]['title'] = browse_desc
break
# If the browse graphic got included in a facet (or extension?)
if not browse_image and 'facets' in data_item:
for fc_idx, facet in enumerate(data_item['facets']):
for fl_idx, fl_json in enumerate(facet['files']):
# Find the uploaded file that includes 'browse' and has an extension of png, gif, or jpg
                match = re.fullmatch(r'^.*browse.*\.(png|gif|jpg|jpeg)$', fl_json['name'])
if match:
# If it matches the pattern, set useForPreview to True and add the browse description from the XML as the title
browse_image = match[0]
data_item['facets'][fc_idx]['files'][fl_idx]['useForPreview'] = True
data_item['facets'][fc_idx]['files'][fl_idx]['title'] = browse_desc
break
if match:
break
if not browse_image:
print('No browse image found in the uploaded files.')
else:
if verbose:
print('browse filename: {}'.format(browse_image))
return(data_item, browse_image)
def update_browse(sb, in_metadata, page_id, verbose=True):
# Match the XML values with the image file on the SB page.
# 1. Parse the XML file and get the caption.
# 2. Find a *browse* image file in the SB page: set the 'useForPreview' to True and get the filename
# 3. Update the browse filename and browse type in the XML.
# Get the caption from the metadata
metadata_root, tree, xml_file = get_root_flexibly(in_metadata)
browse_desc = metadata_root.findall('./idinfo/browse/browsed')[0].text
browse_desc = trunc(browse_desc, 80)
# Set the browse image as previewImage and get the filename from SB
data_item = sb.get_item(page_id)
if verbose:
print(data_item['title'])
data_item, browse_image = find_browse_in_json(data_item, browse_desc, verbose)
if not browse_image:
# If the browse image wasn't found, there's no point continuing
return(False)
# Update the SB page with 'useForPreview'=True
data_item = sb.update_item(data_item)
# Update browsen in XML to the get-file URL and browset to the type...
browset = os.path.splitext(browse_image)[1][1:].upper()
sb_urlstr_fileget = 'https://www.sciencebase.gov/catalog/file/get/{}/?name={}'
browse_link = sb_urlstr_fileget.format(page_id, browse_image)
old_link = metadata_root.findall('./idinfo/browse/browsen')[0].text
old_t = metadata_root.findall('./idinfo/browse/browset')[0].text
if old_link == browse_link and old_t == browset:
return
metadata_root.findall('./idinfo/browse/browsen')[0].text = browse_link
metadata_root.findall('./idinfo/browse/browset')[0].text = browset
# Either overwrite XML file with new XML or return the updated metadata_root
if type(in_metadata) is str:
tree.write(xml_file)
print("Updated XML: {}".format(os.path.basename(xml_file)))
return(True)
else:
return(metadata_root)
def get_parent_bounds(sb, parent_id, verbose=False):
# UPDATED 9/6/17: added "and i < len(kids)", changed 1 to i in second loop, and added "if not parent_bounds: parent_bounds = bbox"
item = sb.get_item(parent_id)
kids = sb.get_child_ids(parent_id)
if len(kids) > 0:
# Initialize parent_bounds with first child
i = 0
found = False
        while not found and i < len(kids): # stop when a bounding box is found or when there are no items left to search
            try:
                child = sb.get_item(kids[i])
            except Exception:
                i += 1
                continue
if 'facets' in child:
parent_bounds = child['facets'][0]['boundingBox']
found = True
elif 'spatial' in child:
parent_bounds = child['spatial']['boundingBox']
found = True
else:
i += 1
print("Child item '{}'' does not have 'spatial' or 'facets' fields.".format(child['title']))
if len(kids) > i:
# Loop through kids
for cid in kids[i:]:
child = sb.get_item(cid)
if 'facets' in child:
bbox = child['facets'][0]['boundingBox'] # {u'minX': -81.43, u'minY': 28.374, u'maxX': -80.51, u'maxY': 30.70}
elif 'spatial' in child:
bbox = child['spatial']['boundingBox']
else:
continue
if not parent_bounds: # if the first step didn't find a parent, set parent_bounds to current
parent_bounds = bbox
for corner in parent_bounds:
if 'min' in corner:
parent_bounds[corner] = min(bbox[corner], parent_bounds[corner])
if 'max' in corner:
parent_bounds[corner] = max(bbox[corner], parent_bounds[corner])
# Update parent bounding box
if 'parent_bounds' in locals():
try:
item['spatial']['boundingBox'] = parent_bounds
except KeyError:
if parent_bounds:
item['spatial'] = {}
item['spatial']['boundingBox'] = parent_bounds
item = sb.update_item(item)
if verbose:
print('Updated bounding box for parent "{}"'.format(item['title']))
else:
parent_bounds = {}
return parent_bounds
def get_idlist_bottomup(sb, top_id):
tier1 = sb.get_child_ids(top_id)
tier2 = []
for t1 in tier1:
tier2 += sb.get_child_ids(t1)
tier3 = []
for t2 in tier2:
tier3 += sb.get_child_ids(t2)
idlist_bottomup = tier3 + tier2 + tier1
idlist_bottomup.append(top_id)
return idlist_bottomup
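# Example of get_idlist_bottomup ordering (hedged; the function only descends three tiers, as coded above):
#   idlist = get_idlist_bottomup(sb, landing_id)
#   # idlist = [great-grandchild IDs..., grandchild IDs..., child IDs..., landing_id]
# Deepest pages come first so that set_parent_extent() aggregates bounding boxes from the bottom up.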
def set_parent_extent(sb, top_id, verbose=False):
pagelist = get_idlist_bottomup(sb,top_id)
for page in pagelist:
parent_bounds = get_parent_bounds(sb, page, verbose)
return parent_bounds
def find_browse_file(datadir, searchterm='*browse*', extensions=('.png', '.jpg', '.jpeg', '.gif')):
imagelist = []
for ext in extensions:
imagelist.extend(glob.glob(os.path.join(datadir, searchterm + ext)))
if len(imagelist) > 0:
browse_file = os.path.basename(imagelist[0])
return(browse_file)
print("Note: No {} image files found in the directory.".format(searchterm))
return
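# Example usage of find_browse_file (hedged; the directory and file names are hypothetical):
#   find_browse_file('/data/release_dir/site_A')
#   ->  'site_A_browse.png' (the first *browse* image found), or None if there is no match.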
def upload_all_previewImages(sb, parentdir, dict_DIRtoID=False, verbose=False):
# Upload all image files to their respective pages.
# 1. find all image files in folder tree
# 2. for each image, try to upload it
for (root, dirs, files) in os.walk(parentdir):
for d in dirs:
imagelist = glob.glob(os.path.join(root,d,'*browse*.png'))
imagelist.extend(glob.glob(os.path.join(root,d,'*browse*.jpg')))
imagelist.extend(glob.glob(os.path.join(root,d,'*browse*.jpeg')))
imagelist.extend(glob.glob(os.path.join(root,d,'*browse*.gif')))
reldirpath = os.path.join(os.path.relpath(root, os.path.dirname(parentdir)), d)
for f in imagelist:
# sb = log_in(useremail)
try:
item = sb.get_item(dict_DIRtoID[reldirpath])
except:
title = d # dirname should correspond to page title
item = sb.find_items_by_title(title)['items'][0]
if verbose:
print('UPLOADING: preview image to "{}"...\n\n'.format(d))