-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
cms-2016-simulated-datasets: work in progress
- Loading branch information
1 parent
0828d0c
commit bb77295
Showing
10 changed files
with
1,202 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# cms-2016-simulated-datasets | ||
|
||
This directory contains helper scripts used to prepare CMS 2016 open data | ||
release regarding MC simulated datasets. | ||
|
||
|
||
- `code/` folder contains the python code. | ||
- `inputs/` folder contains input text files with the list of datasets for each | ||
year and input files. | ||
|
||
Every step necessary to produce the final `*.json` files is handled by the
`code/interface.py` script. Details about it can be queried with the command:
|
||
```console | ||
$ python3 code/interface.py --help | ||
``` | ||
|
||
Make sure to start a voms-proxy before creating the cache:
```console | ||
$ voms-proxy-init --voms cms --rfc --valid 190:00 | ||
``` | ||
|
||
Set the eos path with | ||
|
||
```console | ||
$ export EOS_MGM_URL=root://eospublic.cern.ch | ||
``` | ||
|
||
Warning: creating the full local cache might take a long time! | ||
|
||
First step is to create EOS file index cache: | ||
|
||
```console | ||
$ python3 ./code/interface.py --create-eos-indexes ../cms-YYYY-simulated-datasets/inputs/CMS-2016-mc-datasets.txt | ||
``` | ||
|
||
This requires the files to be in place in their final locations.
|
||
For early testing, on lxplus, all steps can be run without the EOS file index cache with the flag `--ignore-eos-store`. | ||
|
||
To build sample records (with a limited number of datasets in the input file) do the following: | ||
|
||
|
||
```console | ||
$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store DATASET_LIST | ||
|
||
$ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt | ||
$ python3 ./code/interface.py --create-mcm-json-store --ignore-eos-store DATASET_LIST | ||
|
||
$ openssl pkcs12 -in myCert.p12 -nocerts -nodes -out userkey.nodes.pem # if not present | ||
$ python3 ./code/interface.py --get-conf-files --ignore-eos-store DATASET_LIST | ||
|
||
$ python3 code/lhe_generators.py | ||
|
||
$ python3 ./code/interface.py --create-records --ignore-eos-store DATASET_LIST | ||
$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store DATASET_LIST | ||
``` | ||
|
||
Note that to build the test records an (empty) input file for DOI's and a recid info file must be present in the inputs directory. | ||
Each step builds a subdirectory with a cache (`das-json-store`, `mcm-store` and `config-store`). They are large, do not upload them to the repository. | ||
|
||
The output json file for dataset records go to the `outputs` directory. | ||
|
||
|
||
## lhe_generators | ||
|
||
|
||
```console | ||
python3 code/lhe_generators.py 2> errors > output & | ||
``` | ||
- This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt` | ||
- It works on lxplus or with mounted EOS | ||
- number of threads is set to 20 which is ideal for lxplus | ||
|
||
> :warning: There are many cases with various steps to get generator parameters for LHE (see [#97](https://github.com/cernopendata/data-curation/issues/97)). Thus, in some cases the script might not work as expected, so make sure to read it, check the errors, and make any necessary tweaks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#!/usr/bin/env python | ||
|
||
|
||
""" | ||
Create MC 2012 records. | ||
""" | ||
|
||
import hashlib | ||
import json | ||
import re | ||
import os | ||
import subprocess | ||
import sys | ||
from urllib.request import urlopen | ||
|
||
from utils import get_from_deep_json, \ | ||
populate_doiinfo, \ | ||
get_dataset_format, \ | ||
get_dataset_year, \ | ||
get_author_list_recid, \ | ||
get_doi | ||
from das_json_store import get_das_store_json, \ | ||
get_parent_dataset | ||
from eos_store import XROOTD_URI_BASE, \ | ||
get_dataset_index_file_base, \ | ||
get_dataset_location | ||
from mcm_store import get_mcm_dict, \ | ||
get_global_tag, \ | ||
get_genfragment_url, \ | ||
get_generator_name, \ | ||
get_dataset_energy, \ | ||
get_cmsDriver_script | ||
from config_store import get_conffile_ids | ||
from categorisation import guess_title_category | ||
from dataset_records import get_dataset, \ | ||
newer_dataset_version_exists | ||
|
||
|
||
def create_record(conf_id, conffiles_dir):
    """Build the record dict for one configuration file.

    Reads ``<conffiles_dir>/<conf_id>.configFile`` and returns a dict
    holding the ConfDB identifier and the raw script text.
    """
    config_path = conffiles_dir + '/' + conf_id + '.configFile'
    with open(config_path) as config_file:
        script_text = config_file.read()
    return {'cms_confdb_id': conf_id, 'script': script_text}
|
||
|
||
def create_records(conf_ids, conffiles_dir):
    """Build one record per configuration ID found in `conf_ids`."""
    return [create_record(conf_id, conffiles_dir) for conf_id in conf_ids]
|
||
|
||
def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Print configuration-file records for `datasets` as JSON on stdout.

    Datasets that have a newer version in the input list are skipped
    (with a note on stderr).  Configuration IDs are de-duplicated in
    first-seen order before the records are built.
    """
    current_datasets = []
    for name in datasets:
        if newer_dataset_version_exists(name, datasets):
            print('[ERROR] Ignoring older dataset version ' + name,
                  file=sys.stderr)
            continue
        current_datasets.append(name)

    # dict keys act as an insertion-ordered set of config IDs.
    # get_conffile_ids (config_store) loops over all chain steps itself,
    # so a plain dataset list is enough here.
    unique_ids = {}
    for name in current_datasets:
        ids = get_conffile_ids(name, mcm_dir)
        if ids:
            for config_id in ids:
                unique_ids.setdefault(config_id, True)

    records = create_records(list(unique_ids), conffiles_dir)
    json.dump(records, indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import os | ||
import subprocess | ||
import sys | ||
|
||
from eos_store import check_datasets_in_eos_dir | ||
from mcm_store import get_conffile_ids_from_mcm | ||
from utils import get_from_deep_json | ||
|
||
|
||
def get_conffile_ids_all_chain_steps(dataset, mcm_dir):
    """Return the configuration-file IDs gathered from every chain step.

    Walks ``<mcm_dir>/chain/<dataset-with-@>`` and collects the McM
    config IDs of each step directory, de-duplicated in first-seen order.
    """
    chain_path = mcm_dir + '/chain/' + dataset.replace('/', '@')
    unique_ids = {}
    for step_name in os.listdir(chain_path):
        step_dir = chain_path + '/' + step_name
        for config_id in get_conffile_ids_from_mcm(dataset, step_dir):
            unique_ids[config_id] = 1
    return list(unique_ids)
|
||
|
||
def main(eos_dir,
         mcm_dir,
         conf_dir,
         datasets,
         ignore_eos_store):
    """Download the CouchDB configuration files for `datasets` into `conf_dir`.

    Collects the configuration IDs from all McM chain steps of every
    dataset (optionally restricted to those present in the EOS store) and
    fetches each ``<id>.configFile`` from cmsweb, skipping files already
    cached with non-zero size.  Requires a Grid user key/cert under
    ``~/.globus/``.
    """
    # only for the datasets with EOS file information
    if ignore_eos_store:
        eos_datasets = datasets.copy()
    else:
        eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

    conffile_ids = []
    for dataset_full_name in eos_datasets:
        for conffile_id in get_conffile_ids_all_chain_steps(dataset_full_name, mcm_dir):
            if conffile_id not in conffile_ids:
                conffile_ids.append(conffile_id)

    # exist_ok makes the pre-check redundant (and race-free)
    os.makedirs(conf_dir, exist_ok=True)

    # expand `~` explicitly: with an argument list there is no shell to do it
    key_nodes = os.path.expanduser("~/.globus/userkey.nodes.pem")
    cert = os.path.expanduser("~/.globus/usercert.pem")

    total = len(conffile_ids)
    for i, conffile_id in enumerate(conffile_ids, start=1):
        filepath = "{}/{}.configFile".format(conf_dir, conffile_id)
        if os.path.exists(filepath) and os.stat(filepath).st_size != 0:
            print("==> " + conffile_id + ".configFile\n==> Already exist. Skipping...")
            continue

        print("Getting ({}/{}) {}/{}.configFile".format(i, total, conf_dir, conffile_id))

        # argument list instead of an interpolated shell string, so that
        # conffile_id can never be interpreted by a shell
        cmd = ["curl", "-s", "-k",
               "--key", key_nodes,
               "--cert", cert,
               "https://cmsweb.cern.ch/couchdb/reqmgr_config_cache/{}/configFile".format(conffile_id)]
        conffile = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        confs = conffile.stdout.decode("utf-8")
        if confs:
            with open(filepath, 'w') as outfile:
                outfile.write(confs)
        else:
            print("[ERROR] Empty conf file for {ds}".format(ds=conffile_id), file=sys.stderr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
import json | ||
import os | ||
import subprocess | ||
import sys | ||
import threading | ||
from time import sleep | ||
|
||
from eos_store import check_datasets_in_eos_dir | ||
from utils import get_dataset_name, get_from_deep_json | ||
|
||
|
||
def get_parent_dataset(dataset, das_dir):
    """Return the parent dataset name, or '' when none is cached.

    Not used for 2016.
    """
    cache_file = das_dir + '/parent/' + dataset.replace('/', '@') + '.json'
    if not (os.path.exists(cache_file) and os.stat(cache_file).st_size != 0):
        return ''
    return get_from_deep_json(get_das_store_json(dataset, 'parent', das_dir),
                              'parent_dataset')
|
||
|
||
def get_das_store_json(dataset, query='dataset', das_dir=''):
    """Load the cached DAS JSON for `dataset` under the given `query`.

    Returns an empty dict (and logs to stderr) when the dataset name is
    empty or when no non-empty cache file exists.
    """
    if not dataset:
        print('[ERROR] There is no DAS JSON store', query, 'for dataset', dataset,
              file=sys.stderr)
        return json.loads('{}')

    cache_file = das_dir + '/' + query + '/' + dataset.replace('/', '@') + '.json'
    if os.path.exists(cache_file) and os.stat(cache_file).st_size != 0:
        with open(cache_file, 'r') as cache_stream:
            return json.load(cache_stream)

    print('[ERROR] There is no DAS JSON store ' + query + ' for dataset ' + dataset,
          file=sys.stderr)
    return json.loads('{}')
|
||
|
||
def mydasgoclient(dataset, query, out_dir, out_file):
    """Run dasgoclient for `dataset`/`query` and cache its JSON output.

    Skips the call when ``<out_dir>/<query>/<out_file>`` already exists
    with non-zero size; otherwise writes the dasgoclient stdout there,
    logging to stderr on failure or empty output.
    """
    target = out_dir + '/' + query + '/' + out_file
    if os.path.exists(target) and os.stat(target).st_size != 0:
        print('==> {:<9} {}'.format(query, dataset) +
              '\n==> File already exist, skipping...\n')
        return

    print('\t{:<9} {}'.format(query, dataset))

    query_prefix = '' if query == "dataset" else query + ' '
    cmd = 'dasgoclient -query "' + query_prefix + 'dataset=' + dataset + '" -json'

    das = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # NOTE(review): exit code 16 appears to flag dasgoclient query errors
    # here — TODO confirm against dasgoclient's documented exit codes
    if das.returncode == 16:
        print("[Error] in ", cmd, file=sys.stderr)
        print(das.stderr.decode("utf-8"), "\n", file=sys.stderr)
        return

    payload = das.stdout.decode("utf-8")
    if payload:
        with open(target, 'w') as dasfile:
            dasfile.write(payload)
    else:
        print("[ERROR] Empty DAS {query} for {ds}".format(query=query, ds=dataset),
              file=sys.stderr)
|
||
|
||
def create(dataset, das_dir):
    """Cache the DAS 'dataset', 'config' and 'release' queries for `dataset`."""
    cache_name = dataset.replace('/', '@') + ".json"
    for query in ("dataset", "config", "release"):
        mydasgoclient(dataset, query, das_dir, cache_name)
|
||
|
||
def main(das_dir,
         eos_dir,
         datasets,
         ignore_eos_store):
    """Populate the DAS JSON store for `datasets`.

    Creates the ``dataset``, ``config`` and ``release`` cache directories
    and launches one ``create()`` thread per dataset, keeping at most 100
    dasgoclient invocations in flight at any time.
    """
    # create dirs for dataset, config and release caches;
    # exist_ok avoids the racy exists-then-makedirs dance
    for path in [das_dir + '/dataset',
                 das_dir + '/config',
                 das_dir + '/release']:
        os.makedirs(path, exist_ok=True)

    # only for the datasets with EOS file information
    if ignore_eos_store:
        eos_datasets = datasets.copy()
    else:
        eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

    total = len(eos_datasets)
    for i, dataset in enumerate(eos_datasets, start=1):
        print("dasgoclienting ({}/{})".format(i, total), dataset)
        worker = threading.Thread(target=create, args=(dataset, das_dir))
        worker.start()
        # throttle to 100 parallel dasgoclient commands;
        # threading.activeCount() was removed in Python 3.12 — use the
        # snake_case name
        while threading.active_count() >= 100:
            sleep(0.5)
|
||
|
||
def get_generator_parameters(dataset, das_dir):
    """Return generator parameters dictionary for given dataset. Not used in 2016"""
    # TODO get from mcm store instead?
    # and/or from xsecDB
    params = get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir),
                                'generator_parameters')
    return params[0] if params else {}
|
||
|
||
def get_cmssw_version_from_das(dataset, das_dir):
    """Return CMSSW release version from DAS JSON. Not used in 2016"""
    names = get_from_deep_json(get_das_store_json(dataset, 'release', das_dir),
                               'name')
    # keeps the original fallback of {} (not '') when nothing was found
    return names[0] if names else {}
Oops, something went wrong.