-
Notifications
You must be signed in to change notification settings - Fork 22
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
cms-2016-simulated-datasets: work in progress
- Loading branch information
1 parent
0828d0c
commit bb77295
Showing
10 changed files
with
1,202 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# cms-2016-simulated-datasets | ||
|
||
This directory contains helper scripts used to prepare CMS 2016 open data | ||
release regarding MC simulated datasets. | ||
|
||
|
||
- `code/` folder contains the python code. | ||
- `inputs/` folder contains input text files with the list of datasets for each | ||
year and input files. | ||
|
||
Every step necessary to produce the final `*.json` files is handled by the
`code/interface.py` script. Details about it can be queried with the command:
|
||
```console | ||
$ python3 code/interface.py --help | ||
``` | ||
|
||
Make sure to start a voms-proxy before creating the cache:
```console | ||
$ voms-proxy-init --voms cms --rfc --valid 190:00 | ||
``` | ||
|
||
Set the eos path with | ||
|
||
```console | ||
$ export EOS_MGM_URL=root://eospublic.cern.ch | ||
``` | ||
|
||
Warning: creating the full local cache might take a long time! | ||
|
||
First step is to create EOS file index cache: | ||
|
||
```console | ||
$ python3 ./code/interface.py --create-eos-indexes ../cms-YYYY-simulated-datasets/inputs/CMS-2016-mc-datasets.txt | ||
``` | ||
|
||
This requires the files to be in place in their final locations.
|
||
For early testing, on lxplus, all steps can be run without the EOS file index cache with the flag `--ignore-eos-store`. | ||
|
||
To build sample records (with a limited number of datasets in the input file) do the following: | ||
|
||
|
||
```console | ||
$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store DATASET_LIST | ||
|
||
$ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt | ||
$ python3 ./code/interface.py --create-mcm-json-store --ignore-eos-store DATASET_LIST | ||
|
||
$ openssl pkcs12 -in myCert.p12 -nocerts -nodes -out userkey.nodes.pem # if not present | ||
$ python3 ./code/interface.py --get-conf-files --ignore-eos-store DATASET_LIST | ||
|
||
$ python3 code/lhe_generators.py | ||
|
||
$ python3 ./code/interface.py --create-records --ignore-eos-store DATASET_LIST | ||
$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store DATASET_LIST | ||
``` | ||
|
||
Note that to build the test records an (empty) input file for DOI's and a recid info file must be present in the inputs directory. | ||
Each step builds a subdirectory with a cache (`das-json-store`, `mcm-store` and `config-store`). They are large, do not upload them to the repository. | ||
|
||
The output json file for dataset records go to the `outputs` directory. | ||
|
||
|
||
## lhe_generators | ||
|
||
|
||
```console | ||
python3 code/lhe_generators.py 2> errors > output & | ||
``` | ||
- This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt` | ||
- It works on lxplus or with mounted EOS | ||
- number of threads is set to 20 which is ideal for lxplus | ||
|
||
> :warning: There are many cases with various steps to get generator parameters for LHE (see [#97](https://github.com/cernopendata/data-curation/issues/97)). Thus, in some cases the script might not work as expected, so make sure to read it, check the errors, and make any necessary tweaks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
#!/usr/bin/env python | ||
|
||
|
||
""" | ||
Create MC 2012 records. | ||
""" | ||
|
||
import hashlib | ||
import json | ||
import re | ||
import os | ||
import subprocess | ||
import sys | ||
from urllib.request import urlopen | ||
|
||
from utils import get_from_deep_json, \ | ||
populate_doiinfo, \ | ||
get_dataset_format, \ | ||
get_dataset_year, \ | ||
get_author_list_recid, \ | ||
get_doi | ||
from das_json_store import get_das_store_json, \ | ||
get_parent_dataset | ||
from eos_store import XROOTD_URI_BASE, \ | ||
get_dataset_index_file_base, \ | ||
get_dataset_location | ||
from mcm_store import get_mcm_dict, \ | ||
get_global_tag, \ | ||
get_genfragment_url, \ | ||
get_generator_name, \ | ||
get_dataset_energy, \ | ||
get_cmsDriver_script | ||
from config_store import get_conffile_ids | ||
from categorisation import guess_title_category | ||
from dataset_records import get_dataset, \ | ||
newer_dataset_version_exists | ||
|
||
|
||
def create_record(conf_id, conffiles_dir):
    """Build the record dict for one configuration file.

    Reads ``<conffiles_dir>/<conf_id>.configFile`` and returns a dict
    holding the ConfDB identifier and the raw script text.
    """
    config_path = conffiles_dir + '/' + conf_id + '.configFile'
    with open(config_path) as config_file:
        script_text = config_file.read()
    return {'cms_confdb_id': conf_id, 'script': script_text}
|
||
|
||
def create_records(conf_ids, conffiles_dir):
    """Build one record per configuration ID found in `conf_ids`."""
    return [create_record(conf_id, conffiles_dir) for conf_id in conf_ids]
|
||
|
||
def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir):
    """Print configuration-file records for `datasets` as JSON on stdout.

    Datasets that have a newer version in the input list are skipped
    (with a note on stderr).  Configuration IDs are de-duplicated in
    first-seen order before the records are built.
    """
    current_datasets = []
    for name in datasets:
        if newer_dataset_version_exists(name, datasets):
            print('[ERROR] Ignoring older dataset version ' + name,
                  file=sys.stderr)
            continue
        current_datasets.append(name)

    # dict keys act as an insertion-ordered set of config IDs.
    # get_conffile_ids (config_store) loops over all chain steps itself,
    # so a plain dataset list is enough here.
    unique_ids = {}
    for name in current_datasets:
        ids = get_conffile_ids(name, mcm_dir)
        if ids:
            for config_id in ids:
                unique_ids.setdefault(config_id, True)

    records = create_records(list(unique_ids), conffiles_dir)
    json.dump(records, indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import os | ||
import subprocess | ||
import sys | ||
|
||
from eos_store import check_datasets_in_eos_dir | ||
from mcm_store import get_conffile_ids_from_mcm | ||
from utils import get_from_deep_json | ||
|
||
|
||
def get_conffile_ids_all_chain_steps(dataset, mcm_dir):
    """Return the configuration-file IDs gathered from every chain step.

    Walks ``<mcm_dir>/chain/<dataset-with-@>`` and collects the McM
    config IDs of each step directory, de-duplicated in first-seen order.
    """
    chain_path = mcm_dir + '/chain/' + dataset.replace('/', '@')
    unique_ids = {}
    for step_name in os.listdir(chain_path):
        step_dir = chain_path + '/' + step_name
        for config_id in get_conffile_ids_from_mcm(dataset, step_dir):
            unique_ids[config_id] = 1
    return list(unique_ids)
|
||
|
||
def main(eos_dir,
         mcm_dir,
         conf_dir,
         datasets,
         ignore_eos_store):
    """Download the CouchDB configuration files for `datasets` into `conf_dir`.

    Collects the configuration IDs from all McM chain steps of every
    dataset (optionally restricted to those present in the EOS store) and
    fetches each ``<id>.configFile`` from cmsweb, skipping files already
    cached with non-zero size.  Requires a Grid user key/cert under
    ``~/.globus/``.
    """
    # only for the datasets with EOS file information
    if ignore_eos_store:
        eos_datasets = datasets.copy()
    else:
        eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

    conffile_ids = []
    for dataset_full_name in eos_datasets:
        for conffile_id in get_conffile_ids_all_chain_steps(dataset_full_name, mcm_dir):
            if conffile_id not in conffile_ids:
                conffile_ids.append(conffile_id)

    # exist_ok makes the pre-check redundant (and race-free)
    os.makedirs(conf_dir, exist_ok=True)

    # expand `~` explicitly: with an argument list there is no shell to do it
    key_nodes = os.path.expanduser("~/.globus/userkey.nodes.pem")
    cert = os.path.expanduser("~/.globus/usercert.pem")

    total = len(conffile_ids)
    for i, conffile_id in enumerate(conffile_ids, start=1):
        filepath = "{}/{}.configFile".format(conf_dir, conffile_id)
        if os.path.exists(filepath) and os.stat(filepath).st_size != 0:
            print("==> " + conffile_id + ".configFile\n==> Already exist. Skipping...")
            continue

        print("Getting ({}/{}) {}/{}.configFile".format(i, total, conf_dir, conffile_id))

        # argument list instead of an interpolated shell string, so that
        # conffile_id can never be interpreted by a shell
        cmd = ["curl", "-s", "-k",
               "--key", key_nodes,
               "--cert", cert,
               "https://cmsweb.cern.ch/couchdb/reqmgr_config_cache/{}/configFile".format(conffile_id)]
        conffile = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        confs = conffile.stdout.decode("utf-8")
        if confs:
            with open(filepath, 'w') as outfile:
                outfile.write(confs)
        else:
            print("[ERROR] Empty conf file for {ds}".format(ds=conffile_id), file=sys.stderr)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
import json | ||
import os | ||
import subprocess | ||
import sys | ||
import threading | ||
from time import sleep | ||
|
||
from eos_store import check_datasets_in_eos_dir | ||
from utils import get_dataset_name, get_from_deep_json | ||
|
||
|
||
def get_parent_dataset(dataset, das_dir):
    """Return the parent dataset name, or '' when none is cached.

    Not used for 2016.
    """
    cache_file = das_dir + '/parent/' + dataset.replace('/', '@') + '.json'
    if not (os.path.exists(cache_file) and os.stat(cache_file).st_size != 0):
        return ''
    return get_from_deep_json(get_das_store_json(dataset, 'parent', das_dir),
                              'parent_dataset')
|
||
|
||
def get_das_store_json(dataset, query='dataset', das_dir=''):
    """Load the cached DAS JSON for `dataset` under the given `query`.

    Returns an empty dict (and logs to stderr) when the dataset name is
    empty or when no non-empty cache file exists.
    """
    if not dataset:
        print('[ERROR] There is no DAS JSON store', query, 'for dataset', dataset,
              file=sys.stderr)
        return json.loads('{}')

    cache_file = das_dir + '/' + query + '/' + dataset.replace('/', '@') + '.json'
    if os.path.exists(cache_file) and os.stat(cache_file).st_size != 0:
        with open(cache_file, 'r') as cache_stream:
            return json.load(cache_stream)

    print('[ERROR] There is no DAS JSON store ' + query + ' for dataset ' + dataset,
          file=sys.stderr)
    return json.loads('{}')
|
||
|
||
def mydasgoclient(dataset, query, out_dir, out_file):
    """Run dasgoclient for `dataset`/`query` and cache its JSON output.

    Skips the call when ``<out_dir>/<query>/<out_file>`` already exists
    with non-zero size; otherwise writes the dasgoclient stdout there,
    logging to stderr on failure or empty output.
    """
    target = out_dir + '/' + query + '/' + out_file
    if os.path.exists(target) and os.stat(target).st_size != 0:
        print('==> {:<9} {}'.format(query, dataset) +
              '\n==> File already exist, skipping...\n')
        return

    print('\t{:<9} {}'.format(query, dataset))

    query_prefix = '' if query == "dataset" else query + ' '
    cmd = 'dasgoclient -query "' + query_prefix + 'dataset=' + dataset + '" -json'

    das = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # NOTE(review): exit code 16 appears to flag dasgoclient query errors
    # here — TODO confirm against dasgoclient's documented exit codes
    if das.returncode == 16:
        print("[Error] in ", cmd, file=sys.stderr)
        print(das.stderr.decode("utf-8"), "\n", file=sys.stderr)
        return

    payload = das.stdout.decode("utf-8")
    if payload:
        with open(target, 'w') as dasfile:
            dasfile.write(payload)
    else:
        print("[ERROR] Empty DAS {query} for {ds}".format(query=query, ds=dataset),
              file=sys.stderr)
|
||
|
||
def create(dataset, das_dir):
    """Cache the DAS 'dataset', 'config' and 'release' queries for `dataset`."""
    cache_name = dataset.replace('/', '@') + ".json"
    for query in ("dataset", "config", "release"):
        mydasgoclient(dataset, query, das_dir, cache_name)
|
||
|
||
def main(das_dir,
         eos_dir,
         datasets,
         ignore_eos_store):
    """Populate the DAS JSON store for `datasets`.

    Creates the ``dataset``, ``config`` and ``release`` cache directories
    and launches one ``create()`` thread per dataset, keeping at most 100
    dasgoclient invocations in flight at any time.
    """
    # create dirs for dataset, config and release caches;
    # exist_ok avoids the racy exists-then-makedirs dance
    for path in [das_dir + '/dataset',
                 das_dir + '/config',
                 das_dir + '/release']:
        os.makedirs(path, exist_ok=True)

    # only for the datasets with EOS file information
    if ignore_eos_store:
        eos_datasets = datasets.copy()
    else:
        eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

    total = len(eos_datasets)
    for i, dataset in enumerate(eos_datasets, start=1):
        print("dasgoclienting ({}/{})".format(i, total), dataset)
        worker = threading.Thread(target=create, args=(dataset, das_dir))
        worker.start()
        # throttle to 100 parallel dasgoclient commands;
        # threading.activeCount() was removed in Python 3.12 — use the
        # snake_case name
        while threading.active_count() >= 100:
            sleep(0.5)
|
||
|
||
def get_generator_parameters(dataset, das_dir):
    """Return generator parameters dictionary for given dataset. Not used in 2016"""
    # TODO get from mcm store instead?
    # and/or from xsecDB
    params = get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir),
                                'generator_parameters')
    return params[0] if params else {}
|
||
|
||
def get_cmssw_version_from_das(dataset, das_dir):
    """Return CMSSW release version from DAS JSON. Not used in 2016"""
    names = get_from_deep_json(get_das_store_json(dataset, 'release', das_dir),
                               'name')
    # keeps the original fallback of {} (not '') when nothing was found
    return names[0] if names else {}
Oops, something went wrong.