From bb7729519b3baf5746325628f9f08d189bbdaf1b Mon Sep 17 00:00:00 2001 From: Kati Lassila-Perini Date: Fri, 20 Oct 2023 16:48:40 +0200 Subject: [PATCH] cms-2016-simulated-datasets: work in progress --- cms-2016-simulated-datasets/README.md | 75 +++ .../code/conffiles_records.py | 83 +++ .../code/config_store.py | 71 +++ .../code/das_json_store.py | 130 +++++ .../code/dataset_records.py | 529 ++++++++++++++++++ cms-2016-simulated-datasets/code/interface.py | 6 +- cms-2016-simulated-datasets/code/mcm_store.py | 303 ++++++++++ .../inputs/CMS-2016-mc-datasets.txt | 3 + .../inputs/doi-sim.txt | 0 .../inputs/recid_info.py | 5 + 10 files changed, 1202 insertions(+), 3 deletions(-) create mode 100644 cms-2016-simulated-datasets/README.md create mode 100644 cms-2016-simulated-datasets/code/conffiles_records.py create mode 100644 cms-2016-simulated-datasets/code/config_store.py create mode 100644 cms-2016-simulated-datasets/code/das_json_store.py create mode 100644 cms-2016-simulated-datasets/code/dataset_records.py create mode 100644 cms-2016-simulated-datasets/code/mcm_store.py create mode 100644 cms-2016-simulated-datasets/inputs/CMS-2016-mc-datasets.txt create mode 100644 cms-2016-simulated-datasets/inputs/doi-sim.txt create mode 100644 cms-2016-simulated-datasets/inputs/recid_info.py diff --git a/cms-2016-simulated-datasets/README.md b/cms-2016-simulated-datasets/README.md new file mode 100644 index 000000000..13f1c5666 --- /dev/null +++ b/cms-2016-simulated-datasets/README.md @@ -0,0 +1,75 @@ +# cms-2016-simulated-datasets + +This directory contains helper scripts used to prepare CMS 2016 open data +release regarding MC simulated datasets. + + +- `code/` folder contains the python code. +- `inputs/` folder contains input text files with the list of datasets for each + year and input files. + +Every step necessary to produce the final `*.json` files is handled by the +`cmc-mc/interface.py` script. Details about it can be queried with the command: + +```console +$ python3 code/interface.py --help +``` + +Make sure to start voms-proxy before creating cache +```console +$ voms-proxy-init --voms cms --rfc --valid 190:00 +``` + +Set the eos path with + +```console +$ export EOS_MGM_URL=root://eospublic.cern.ch +``` + +Warning: creating the full local cache might take a long time! + +First step is to create EOS file index cache: + +```console +$ python3 ./code/interface.py --create-eos-indexes ../cms-YYYY-simulated-datasets/inputs/CMS-2016-mc-datasets.txt +``` + +This requires the file to be in place in their final location. + +For early testing, on lxplus, all steps can be run without the EOS file index cache with the flag `--ignore-eos-store`. 
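+
+For testing, `DATASET_LIST` in the commands below can be a shorter text file in the same
+format as `inputs/CMS-2016-mc-datasets.txt`, i.e. one full dataset path per line:
+
+```console
+$ head -1 inputs/CMS-2016-mc-datasets.txt
+/ADDmonoPhoton_MD-1_d-3_TuneCP5_13TeV-pythia8/RunIISummer20UL16NanoAODv9-106X_mcRun2_asymptotic_v17-v2/NANOAODSIM
+```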
+ +To build sample records (with a limited number of datasets in the input file) do the following: + + +```console +$ python3 ./code/interface.py --create-das-json-store --ignore-eos-store DATASET_LIST + +$ auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt +$ python3 ./code/interface.py --create-mcm-json-store --ignore-eos-store DATASET_LIST + +$ openssl pkcs12 -in myCert.p12 -nocerts -nodes -out userkey.nodes.pem # if not present +$ python3 ./code/interface.py --get-conf-files --ignore-eos-store DATASET_LIST + +$ python3 code/lhe_generators.py + +$ python3 ./code/interface.py --create-records --ignore-eos-store DATASET_LIST +$ python3 ./code/interface.py --create-conffiles-records --ignore-eos-store DATASET_LIST +``` + +Note that to build the test records an (empty) input file for DOI's and a recid info file must be present in the inputs directory. +Each step builds a subdirectory with a cache (`das-json-store`, `mcm-store` and `config-store`). They are large, do not upload them to the repository. + +The output json file for dataset records go to the `outputs` directory. + + +## lhe_generators + + +```console +python3 code/lhe_generators.py 2> errors > output & +``` +- This will get lhe generator parameters from gridpacks for datasets listed in `./inputs/CMS-2016-mc-datasets.txt` +- It works on lxplus or with mounted EOS +- number of threads is set to 20 which is ideal for lxplus + +> :warning: There are many cases with various steps to get generator parameters for LHE -see [#97](https://github.com/cernopendata/data-curation/issues/97)-. Thus, in some few cases, the script MIGHT not work as expected so make sure to read it, check errors, and make any necessary tweaks diff --git a/cms-2016-simulated-datasets/code/conffiles_records.py b/cms-2016-simulated-datasets/code/conffiles_records.py new file mode 100644 index 000000000..0253e1334 --- /dev/null +++ b/cms-2016-simulated-datasets/code/conffiles_records.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + + +""" +Create MC 2012 records. +""" + +import hashlib +import json +import re +import os +import subprocess +import sys +from urllib.request import urlopen + +from utils import get_from_deep_json, \ + populate_doiinfo, \ + get_dataset_format, \ + get_dataset_year, \ + get_author_list_recid, \ + get_doi +from das_json_store import get_das_store_json, \ + get_parent_dataset +from eos_store import XROOTD_URI_BASE, \ + get_dataset_index_file_base, \ + get_dataset_location +from mcm_store import get_mcm_dict, \ + get_global_tag, \ + get_genfragment_url, \ + get_generator_name, \ + get_dataset_energy, \ + get_cmsDriver_script +from config_store import get_conffile_ids +from categorisation import guess_title_category +from dataset_records import get_dataset, \ + newer_dataset_version_exists + + +def create_record(conf_id, conffiles_dir): + """Create record for the given dataset.""" + + rec = {} + + with open(conffiles_dir + '/' + conf_id + '.configFile') as myfile: + #print(conf_id) + rec['cms_confdb_id'] = conf_id + rec['script'] = myfile.read() + + return rec + + +def create_records(conf_ids, conffiles_dir): + """Create records.""" + + records = [] + for conf_id in conf_ids: + records.append(create_record(conf_id, conffiles_dir)) + return records + + +def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir): + "Do the job." 
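+    # Sketch of the flow below: skip datasets that have a newer version in the
+    # input list, collect the unique configuration-file IDs for the remaining
+    # datasets from the McM cache, and dump one record per ID as JSON to stdout.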
+ + dataset_full_names = [] + for dataset_full_name in datasets: + if newer_dataset_version_exists(dataset_full_name, datasets): + print('[ERROR] Ignoring older dataset version ' + dataset_full_name, + file=sys.stderr) + else: + dataset_full_names.append(dataset_full_name) + + conffiles = [] + for ds in dataset_full_names: + #config_ids = get_conffile_ids(ds, das_dir, mcm_dir), using mcm for now, add the step loop, ds list is not enough + # get_conffile_ids in config_store has the loop inside, use that + config_ids = get_conffile_ids(ds, mcm_dir) + if config_ids: + for config_id in config_ids: + if config_id not in conffiles: + conffiles.append(config_id) + + records = create_records(conffiles, conffiles_dir) + json.dump(records, indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout) diff --git a/cms-2016-simulated-datasets/code/config_store.py b/cms-2016-simulated-datasets/code/config_store.py new file mode 100644 index 000000000..671b161e0 --- /dev/null +++ b/cms-2016-simulated-datasets/code/config_store.py @@ -0,0 +1,71 @@ +import os +import subprocess +import sys + +from eos_store import check_datasets_in_eos_dir +from mcm_store import get_conffile_ids_from_mcm +from utils import get_from_deep_json + + +def get_conffile_ids_all_chain_steps(dataset, mcm_dir): + """Return location of the configuration files for the dataset.""" + ids = {} + path = mcm_dir + '/chain/' + dataset.replace('/', '@') + step_dirs = os.listdir(path) + for step in step_dirs: + step_dir = path + '/' + step + mcm_config_ids = get_conffile_ids_from_mcm(dataset, step_dir) + + for someid in mcm_config_ids: + ids[someid] = 1 + + return list(ids.keys()) + + +def main(eos_dir, + mcm_dir, + conf_dir, + datasets, + ignore_eos_store): + "Do the job" + + # only for the datasets with EOS file information + if ignore_eos_store: + eos_datasets = datasets.copy() + else: + eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir) + + conffile_ids = [] + for dataset_full_name in eos_datasets: + for conffile_id in get_conffile_ids_all_chain_steps(dataset_full_name, mcm_dir): + if conffile_id not in conffile_ids: + conffile_ids.append(conffile_id) + + if not os.path.exists(conf_dir): + os.makedirs(conf_dir, exist_ok=True) + + key_nodes = "~/.globus/userkey.nodes.pem" + cert = "~/.globus/usercert.pem" + + total = len(conffile_ids) + i = 1 + for conffile_id in conffile_ids: + filepath="{}/{}.configFile".format(conf_dir, conffile_id) + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + print("==> " + conffile_id + ".configFile\n==> Already exist. 
Skipping...") + i += 1 + continue + + print("Getting ({}/{}) {}/{}.configFile".format(i, total, conf_dir, conffile_id)) + + cmd = "curl -s -k --key {key} --cert {cert} https://cmsweb.cern.ch/couchdb/reqmgr_config_cache/{conffile_id}/configFile".format(conffile_id=conffile_id, key=key_nodes, cert=cert) + conffile = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + confs = conffile.stdout.decode("utf-8") + if confs: + with open(filepath, 'w') as outfile: + outfile.write(confs) + else: + print("[ERROR] Empty conf file for {ds}".format(ds=conffile_id), file=sys.stderr) + + i += 1 diff --git a/cms-2016-simulated-datasets/code/das_json_store.py b/cms-2016-simulated-datasets/code/das_json_store.py new file mode 100644 index 000000000..4c3508147 --- /dev/null +++ b/cms-2016-simulated-datasets/code/das_json_store.py @@ -0,0 +1,130 @@ +import json +import os +import subprocess +import sys +import threading +from time import sleep + +from eos_store import check_datasets_in_eos_dir +from utils import get_dataset_name, get_from_deep_json + + +def get_parent_dataset(dataset, das_dir): + "Return parent dataset to the given dataset or an empty string if no parent found. Not used for 2016" + parent_dataset = '' + + filepath = das_dir + '/parent/' + dataset.replace('/', '@') + '.json' + + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + parent_dataset = get_from_deep_json(get_das_store_json(dataset, 'parent', das_dir), 'parent_dataset') + return parent_dataset + + +def get_das_store_json(dataset, query='dataset', das_dir=''): + "Return DAS JSON from the DAS JSON Store for the given dataset and given query." + + if not dataset: + print('[ERROR] There is no DAS JSON store', query, 'for dataset', dataset, + file=sys.stderr) + return json.loads('{}') + + filepath = das_dir + '/' + query + '/' + dataset.replace('/', '@') + '.json' + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + with open(filepath, 'r') as filestream: + return json.load(filestream) + else: + print('[ERROR] There is no DAS JSON store ' + query + ' for dataset ' + dataset, + file=sys.stderr) + return json.loads('{}') + + +def mydasgoclient(dataset, query, out_dir, out_file): + "Interface to dasgoclient" + + out = out_dir + '/' + query + '/' + out_file + if os.path.exists(out) and os.stat(out).st_size != 0: + print('==> {:<9} {}'.format(query, dataset) + + '\n==> File already exist, skipping...\n') + return + + print('\t{:<9} {}'.format(query, dataset)) + + cmd = 'dasgoclient -query "' + if query != "dataset": + cmd += query + ' ' + cmd += 'dataset=' + dataset + '" -json' + + + das = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + if das.returncode == 16: # ???? + print("[Error] in ", cmd, file=sys.stderr) + print(das.stderr.decode("utf-8"), "\n", file=sys.stderr) + else: + das_out = das.stdout.decode("utf-8") + if das_out: + with open(out, 'w') as dasfile: + dasfile.write(das_out) + else: + print("[ERROR] Empty DAS {query} for {ds}".format(query=query, ds=dataset), + file=sys.stderr) + + +def create(dataset, das_dir): + + result_file = dataset.replace('/', '@') + ".json" + mydasgoclient(dataset, "dataset", das_dir, result_file) + mydasgoclient(dataset, "config", das_dir, result_file) + mydasgoclient(dataset, "release", das_dir, result_file) + + +def main(das_dir, + eos_dir, + datasets, + ignore_eos_store): + "Do the job." 
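+    # Cache DAS information locally: for each dataset run dasgoclient for the
+    # "dataset", "config" and "release" queries in separate threads (at most
+    # ~100 alive at a time) and store the JSON output under das_dir.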
+ + # create dirs for dataset and release + for path in [das_dir + '/dataset', + das_dir + '/config', + das_dir + '/release']: + if not os.path.exists(path): + os.makedirs(path) + + # only for the datasets with EOS file information + if ignore_eos_store: + eos_datasets = datasets.copy() + else: + eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir) + + total = len(eos_datasets) + i = 1 + for dataset in eos_datasets: + print("dasgoclienting ({}/{})".format(i, total), dataset) + t = threading.Thread(target=create, args=(dataset, das_dir)) + t.start() + while threading.activeCount() >= 100 : + sleep(0.5) # run 100 dasgoclient commands in parallel + i += 1 + + +def get_generator_parameters(dataset, das_dir): + """Return generator parameters dictionary for given dataset. Not used in 2016""" + # TODO get from mcm store instead? + # and/or from xsecDB + out = get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir), + 'generator_parameters') + if out: + return out[0] + else: + return {} + + +def get_cmssw_version_from_das(dataset, das_dir): + """Return CMSSW release version from DAS JSON. Not used in 2016""" + out = get_from_deep_json(get_das_store_json(dataset, 'release', das_dir), + 'name') + if out: + return out[0] + else: + return {} diff --git a/cms-2016-simulated-datasets/code/dataset_records.py b/cms-2016-simulated-datasets/code/dataset_records.py new file mode 100644 index 000000000..8e4a12d36 --- /dev/null +++ b/cms-2016-simulated-datasets/code/dataset_records.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python + + +""" +Create MC 2016 records. +""" + +import hashlib +import json +import os +import re +import subprocess +import sys +import threading +import zlib +from datetime import datetime as dt +from time import sleep + +import requests +from requests.packages.urllib3.exceptions import InsecureRequestWarning + +from categorisation import guess_title_category +from das_json_store import (get_cmssw_version_from_das, get_das_store_json, + get_generator_parameters, get_parent_dataset) +from eos_store import (XROOTD_URI_BASE, get_dataset_index_file_base, + get_dataset_location) +from mcm_store import (get_cmsDriver_script, get_cmssw_version_from_mcm, + get_conffile_ids_from_mcm, get_dataset_energy, + get_generator_name, get_generator_parameters_from_mcm, + get_genfragment_url, get_global_tag, get_mcm_dict, + get_parent_dataset_from_mcm, get_pileup_from_mcm, + get_output_dataset_from_mcm) +from utils import (get_author_list_recid, get_dataset_format, get_dataset_year, + get_doi, get_from_deep_json, + get_recommended_cmssw_for_analysis, + get_recommended_global_tag_for_analysis, populate_doiinfo) + +requests.packages.urllib3.disable_warnings(InsecureRequestWarning) + + +LINK_INFO = {} + + +def get_number_events(dataset, das_dir): + """Return number of events for the dataset.""" + number_events = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'nevents') + if number_events: + return number_events + return 0 + + +def get_number_files(dataset, das_dir): + """Return number of files for the dataset.""" + number_files = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'nfiles') + if number_files: + return number_files + return 0 + + +def get_size(dataset, das_dir): + """Return size of the dataset.""" + size = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'size') + if size: + return size + return 0 + + +def get_file_size(afile): + "Return file size of a file." 
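+    # Size in bytes of the locally readable file; used below for the EOS
+    # file-index files listed in get_dataset_index_files().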
+ return os.path.getsize(afile) + + +def get_file_checksum(afile): + """Return the ADLER32 checksum of a file.""" + return zlib.adler32(open(afile, 'rb').read(), 1) & 0xffffffff + + +def get_dataset(dataset_full_name): + "Return short dataset name from dataset full name." + return re.search(r'^/(.*?)/', dataset_full_name).groups()[0] + + +def get_dataset_version(dataset_full_name): + "Return dataset version from dataset full name." + return re.search(r'^.*Summer12_DR53X-(.*)/AODSIM$', dataset_full_name).groups()[0] + + +def get_dataset_index_files(dataset_full_name, eos_dir): + """Return list of dataset file information {path,size} for the given dataset.""" + files = [] + dataset_index_file_base = get_dataset_index_file_base(dataset_full_name) + output = subprocess.getoutput('ls ' + eos_dir + ' | grep ' + dataset_index_file_base) + for line in output.split('\n'): + afile = line.strip() + if afile.endswith('.txt') or afile.endswith('.json'): + # take only TXT files + afile_uri = XROOTD_URI_BASE + get_dataset_location(dataset_full_name) + '/file-indexes/' + afile + afile_size = get_file_size(eos_dir + afile) + # FIXME CHANGE THIS checksum TO READ FROM SOMETHING SOMWEHERE + afile_checksum = '{:#010x}'.format(get_file_checksum(eos_dir + afile)).split('0x')[1] + files.append((afile_uri, afile_size, afile_checksum)) + return files + + +def newer_dataset_version_exists(dataset_full_name, all_datasets): + "Return whether the newer version of dataset exists." + + dataset_full_name_until_version = dataset_full_name[0:dataset_full_name.rfind('-')] + similar_datasets = [] + + for dataset in all_datasets: + if dataset_full_name_until_version in dataset: + similar_datasets.append(dataset_full_name_until_version) + return len(similar_datasets) > 1 + + +def get_process(afile, conf_dir): + "Return suitable title of configuration file." + content = '' + try: + with open(conf_dir + '/' + afile, 'r') as myfile: + content = myfile.read() + except FileNotFoundError: + print ( "Error: '" + conf_dir + '/' + afile + "' No such file") + process = '' + m = re.search(r"process = cms.Process\(\s?['\"]([A-Z0-9]+)['\"]\s?(\)|,)", content) + if m: + process = m.groups(1)[0] + #if process == 'PAT': + # process = "MINIAODSIM" + return process + + +def get_cmssw_version(dataset, das_dir, mcm_dir): + """Return CMSSW version either from McM or from DAS.""" + release = get_cmssw_version_from_mcm(dataset, mcm_dir) + if not release: + release = get_cmssw_version_from_das(dataset, das_dir) + return release + + +def get_globaltag_from_conffile(afile, conf_dir): + "Return global tag information from the configuration file." + content = '' + try: + with open(conf_dir + '/' + afile, 'r') as myfile: + content = myfile.read() + except FileNotFoundError: + print ( "Error: '" + conf_dir + '/' + afile + "' No such file") + globaltag = '' + m = re.search(r"globaltag = cms.string\(\s?['\"](.+)['\"]\s?\)", content) + if m: + globaltag = m.groups(1)[0] + else: + m = re.search(r"process.GlobalTag.globaltag\s?=\s?['\"](.+)['\"]", content) + if m: + globaltag = m.groups(1)[0] + return globaltag + + +def get_all_generator_text(dataset, das_dir, mcm_dir, conf_dir, recid): + """Return DICT with all information about the generator steps.""" + + info = {} + info["description"] = "
These data were generated in several steps (see also CMS Monte Carlo production overview):
" + info["steps"] = [] + path = mcm_dir + '/chain/' + dataset.replace('/', '@') + step_dirs = os.listdir(path) + for step_dir in step_dirs: + mcm_step_dir = path + '/' + step_dir + input_dataset = dataset + step = {} + process = '' + output_dataset = get_output_dataset_from_mcm(dataset, mcm_step_dir) + if output_dataset: + step['output_dataset'] = output_dataset + release = get_cmssw_version(dataset, das_dir, mcm_step_dir) + if release: + step['release'] = release + global_tag = get_global_tag(dataset, mcm_step_dir) + if global_tag: + step['global_tag'] = global_tag + + cmsdriver_path = get_cmsDriver_script(dataset, mcm_step_dir) + step['configuration_files'] = [] + if cmsdriver_path: + with open(cmsdriver_path) as myfile: + configuration_files = {} + configuration_files['title'] = 'Production script' + configuration_files['script'] = myfile.read() + if configuration_files: + step['configuration_files'].append(configuration_files) + + generator_names = get_generator_name(dataset, mcm_step_dir) + if generator_names: + step['generators'] = generator_names + + m = re.search('-(.+?)-', step_dir) + if m: + step_name = m.group(1) + if step_name.endswith('GEN'): # get generator parameters for LHEGEN and GEN datasets + step_generator_parameters= get_step_generator_parameters(dataset, mcm_step_dir, recid) + if step_generator_parameters: + step['configuration_files'].extend(step_generator_parameters) + + config_ids = get_conffile_ids_from_mcm(dataset, mcm_step_dir) + if config_ids: + for config_id in config_ids: + afile = config_id + '.configFile' + proc = get_process(afile, conf_dir) + if process: + process += " " + proc + else: + process += proc + configuration_files = {} + configuration_files['title'] = 'Configuration file' + configuration_files['process'] = proc + configuration_files['cms_confdb_id'] = config_id + globaltag = get_globaltag_from_conffile(afile, conf_dir) + if not 'global_tag' in step: + step['global_tag'] = globaltag + + step['configuration_files'].append(configuration_files) + + step['type'] = process + + # Extend LHE steps + if step_name.endswith('LHEGEN'): + step['type'] = "LHE GEN" + for i, configuration_files in enumerate(step['configuration_files']): + if configuration_files['title'] == 'Generator parameters': + step['configuration_files'][i]['title']='Hadronizer parameters' + # extend its LHE generator parameters + step_generator_parameters= get_step_generator_parameters(dataset, mcm_step_dir, recid, 1) # Force LHE + if step_generator_parameters: + step['configuration_files'].extend(step_generator_parameters) + + info["steps"].append(step) + + # reverse order of steps for provenance FIXME: order should be LHEGEN/GEN, SIM, DIGI2RAW, HLT, RECO, PAT, NANO + info['steps'].reverse() + + # post-generation fix: if we have LHE step, let's modify the configuration file titles for other steps + # FIXME: is this now dublicate of the condition above? 
+ lhe_present = False + for step in info['steps']: + if lhe_present: + for configuration_file in step.get('configuration_files'): + if configuration_file['title'] == 'Generator parameters': + print("in the lhe_present condition with title Generator parameters not yet changed") + configuration_file['title'] = 'Hadronizer parameters' + if 'LHE' in step['type']: + lhe_present = True + + # post-generation fix: keep generators only for the first step, remove from others: + generators_present = False + for step in info['steps']: + if generators_present: + if 'generators' in step: + del(step['generators']) + else: + if 'generators' in step: + generators_present = True + + return info + + +def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir): + """Create record for the given dataset.""" + + rec = {} + + dataset = get_dataset(dataset_full_name) + dataset_format = get_dataset_format(dataset_full_name) + year_created = '2016' + year_published = '2023' # + run_period = ['Run2016G', 'Run2016H'] # + global_tag = get_global_tag(dataset_full_name, mcm_dir) + release = get_cmssw_version(dataset_full_name, das_dir, mcm_dir) + + additional_title = 'Simulated dataset ' + dataset + ' in ' + dataset_format + ' format for ' + year_created + ' collision data' + + rec['abstract'] = {} + rec['abstract']['description'] = '
' + additional_title + '. ' + \
+        'See the description of the simulated dataset names in: About CMS simulated dataset names. ' + \
+        'These simulated datasets correspond to the collision data collected by the CMS experiment in ' + year_created + '.
' + + rec['accelerator'] = "CERN-LHC" + + rec['collaboration'] = {} + rec['collaboration']['name'] = 'CMS Collaboration' + rec['collaboration']['recid'] = get_author_list_recid(dataset_full_name) + + rec['collections'] = ['CMS-Simulated-Datasets', ] + + rec['collision_information'] = {} + rec['collision_information']['energy'] = get_dataset_energy(dataset_full_name, mcm_dir) + rec['collision_information']['type'] = 'pp' # FIXME do not hardcode + + # FIXME cross section will be read in separately + generator_parameters = get_generator_parameters_from_mcm(dataset_full_name, mcm_dir) + # if generator_parameters: + # rec['cross_section'] = {} + # rec['cross_section']['value'] = generator_parameters.get('cross_section', None) + # rec['cross_section']['filter_efficiency:'] = generator_parameters.get('filter_efficiency', None) + # rec['cross_section']['filter_efficiency_error:'] = generator_parameters.get('filter_efficiency_error', None) + # rec['cross_section']['match_efficiency:'] = generator_parameters.get('match_efficiency', None) + # rec['cross_section']['match_efficiency error:'] = generator_parameters.get('match_efficiency_error', None) + + rec['date_created'] = [year_created] + rec['date_published'] = year_published + rec['date_reprocessed'] = year_created # FIXME, this is not correct + # for year_reprocessed: could use the year from "pdmv_submission_date": "220201", or "pdmv_monitor_time": "Sun Feb 06 13:24:33 2022", reqmgr_name etc in dict + rec['distribution'] = {} + rec['distribution']['formats'] = [dataset_format.lower(), 'root'] + rec['distribution']['number_events'] = get_number_events(dataset_full_name, das_dir) + rec['distribution']['number_files'] = get_number_files(dataset_full_name, das_dir) + rec['distribution']['size'] = get_size(dataset_full_name, das_dir) + + doi = get_doi(dataset_full_name, doi_info) + if doi: + rec['doi'] = doi + + rec['experiment'] = 'CMS' + + rec_files = get_dataset_index_files(dataset_full_name, eos_dir) + if rec_files: + rec['files'] = [] + for index_type in ['.json', '.txt']: + index_files = [f for f in rec_files if f[0].endswith(index_type)] + for file_number, (file_uri, file_size, file_checksum) in enumerate(index_files): + rec['files'].append({ + 'checksum': 'adler32:' + file_checksum, + 'description': dataset + dataset_format + ' dataset file index (' + str(file_number + 1) + ' of ' + str(len(index_files)) + ') for access to data via CMS virtual machine', + + 'size': file_size, + 'type': 'index' + index_type, + 'uri': file_uri + }) + + rec['license'] = {} + rec['license']['attribution'] = 'CC0' + + rec['methodology'] = get_all_generator_text(dataset_full_name, das_dir, mcm_dir, conffiles_dir, recid_info[dataset_full_name]) + + + pileup_dataset_name= '' + pileup_dataset_name= get_pileup_from_mcm(dataset_full_name, mcm_dir) + + pileup_dataset_recid = { + '/MinBias_TuneZ2_7TeV-pythia6/Summer11Leg-START53_LV4-v1/GEN-SIM': 36, # 2011 + '/MinBias_TuneZ2star_8TeV-pythia6/Summer12-START50_V13-v3/GEN-SIM': 37, # 2012 + '/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-MCRUN2_71_V1-v2/GEN-SIM': 22314, # 2015 + #'/MinBias_TuneCUETP8M1_13TeV-pythia8/RunIISummer15GS-magnetOffBS0T_MCRUN2_71_V1-v1/GEN-SIM': {recid}, # 2015 TODO + '/MinBias_TuneCP5_13TeV-pythia8/RunIIFall18GS-IdealGeometry_102X_upgrade2018_design_v9-v1/GEN-SIM': 12302 # 2018 + }.get(pileup_dataset_name, 0) + + + if pileup_dataset_name: + rec['pileup'] = {} + if pileup_dataset_recid: + rec['pileup']['description'] = "
To make these simulated data comparable with the collision data, pile-up events are added to the simulated event in the DIGI2RAW step.
" + rec['pileup']['links'] = [ + { + "recid": str(pileup_dataset_recid), + "title": pileup_dataset_name + } + ] + else: + rec['pileup']['description'] = "
To make these simulated data comparable with the collision data, pile-up events from the dataset "\
+                + pileup_dataset_name\
+                + " are added to the simulated event in the DIGI2RAW step.
" + + rec['publisher'] = 'CERN Open Data Portal' + + rec['recid'] = str(recid_info[dataset_full_name]) + + # rec['relations'] = [] + # rec['relations']['title'] = '' # FIXME, 2016 Nano are childs of 2016 Mini + # rec['relations']['type'] = 'isChildOf' + + rec['run_period'] = run_period + + # recomended global tag and cmssw release recommended for analysis + recommended_gt = get_recommended_global_tag_for_analysis(dataset_full_name) + recommended_cmssw = get_recommended_cmssw_for_analysis(dataset_full_name) + rec['system_details'] = {} + rec['system_details']['global_tag'] = "106X_dataRun2_v37" + rec['system_details']['release'] = "CMSSW_10_6_30" + + rec['title'] = dataset_full_name + + rec['title_additional'] = additional_title + + topic = guess_title_category(dataset_full_name) + category = topic.split('/')[0] + subcategory = None + if len(topic.split('/')) == 2: + subcategory = topic.split('/')[1] + rec['categories'] = {} + rec['categories']['primary'] = category + if subcategory: + rec['categories']['secondary'] = [subcategory] + rec['categories']['source'] = 'CMS Collaboration' + + rec['type'] = {} + rec['type']['primary'] = 'Dataset' + rec['type']['secondary'] = ['Simulated', ] + + year_getting_started = {'2010': 2010, + '2011': 2011, + '2012': 2011}.get(year_created, 2011) + rec['usage'] = {} + rec['usage']['description'] = "You can access these data through the CMS Open Data container or the CMS Virtual Machine. See the instructions for setting up one of the two alternative environments and getting started in" # FIXME + rec['usage']['links'] = [ # FIXME + { + "description": "Running CMS analysis code using Docker", + "url": "/docs/cms-guide-docker" + }, + { + "description": "How to install the CMS Virtual Machine", + "url": "/docs/cms-virtual-machine-2016" + }, + { + "description": "Getting started with CMS open data", + "url": "/docs/cms-getting-started-2016" + } + ] + + rec['validation'] = {} + rec['validation']['description'] = "The generation and simulation of Monte Carlo data has been validated through general CMS validation procedures." + #rec['validation']['links'] = 'FIXME' + + return rec + +def create(dataset, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir): + filepath= records_dir + "/" + dataset.replace('/', '@') + '.json' + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + print("==> " + dataset + "\n==> Already exist. 
Skipping...\n") + return + + Record= create_record(dataset, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir) + + with open(filepath, 'w') as file: + json.dump(Record, indent=2, sort_keys=True, ensure_ascii=True, fp=file) + + + +def create_records(dataset_full_names, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir): + """Create records.""" + + recid_info = {} + _locals = locals() + exec(open(recid_file, 'r').read(), globals(), _locals) + recid_info = _locals['RECID_INFO'] + + doi_info = populate_doiinfo(doi_file) + + records = [] + for dataset_full_name in dataset_full_names: + #2016: comment out threading for debugging + t= threading.Thread(target=create, args=(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir)) + t.start() + while threading.activeCount() >= 20 : + sleep(0.5) # run 20 parallel + + #records.append(create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir)) + #return records + + +def print_records(records): + """Print records.""" + print('[') + for (idx, rec) in enumerate(records): + #print(json.dumps(rec, indent=2, sort_keys=True, ensure_ascii=True)) + print(rec) + if idx == len(records) - 1: + pass + else: + print(',') + print(']') + + +def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir, doi_file, recid_file): + "Do the job." + + records_dir= "./outputs/records-" + dt.now().strftime("%Y-%m") + os.makedirs(records_dir, exist_ok=True) + + create_records(datasets, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir) + + #records = create_records(datasets, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir) + #json.dump(records, indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout) + + +def get_step_generator_parameters(dataset, mcm_dir, recid, force_lhe=0): + configuration_files = {} + if force_lhe: + mcdb_id= get_from_deep_json(get_mcm_dict(dataset,mcm_dir), "mcdb_id") or 0 + if mcdb_id > 1: + print("Got mcdb > 1: " + str(mcdb_id)) + configuration_files['title'] = 'Generator parameters' + configuration_files['url'] = "/eos/opendata/cms/lhe_generators/2015-sim/mcdb/{mcdb_id}_header.txt".format(mcdb_id=mcdb_id) + return [configuration_files] + else: + dir='./lhe_generators/2016-sim/gridpacks/' + str(recid) + '/' + files = [] + files = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))] + confarr=[] + for f in files: + configuration_files['title'] = 'Generator parameters: ' + f + configuration_files['url'] = '/eos/opendata/cms/lhe_generators/2016-sim/gridpacks/' + str(recid) + '/' + f + confarr.append(configuration_files.copy()) + return confarr + else: + gen_fragment = get_genfragment_url(dataset, mcm_dir) + if gen_fragment: + for url in gen_fragment: + configuration_files = {} + configuration_files['title'] = 'Generator parameters' + configuration_files['url'] = url + try: + script = requests.get(url, verify=False).text + configuration_files['script'] = script + if configuration_files: + return [configuration_files] + except: + pass + diff --git a/cms-2016-simulated-datasets/code/interface.py b/cms-2016-simulated-datasets/code/interface.py index c15a9f657..c4809314b 100644 --- a/cms-2016-simulated-datasets/code/interface.py +++ b/cms-2016-simulated-datasets/code/interface.py @@ -159,13 +159,13 @@ def main(dataset_list, if create_mcm_store: import mcm_store - mcm_store.create(datasets, mcm_dir, das_dir, eos_dir, ignore_eos_store) + mcm_store.create(datasets, mcm_dir, eos_dir, 
ignore_eos_store) if get_conf_files: # check if user has key and cert if os.path.isfile(os.environ.get("HOME") + "/.globus/usercert.pem") and os.path.isfile(os.environ.get("HOME") + "/.globus/userkey.nodes.pem"): import config_store - config_store.main(eos_dir, das_dir, conf_dir, datasets, ignore_eos_store) + config_store.main(eos_dir, mcm_dir, conf_dir, datasets, ignore_eos_store) else: print("Error in key and certificate pairs (~/.globus/usercert.pem, ~/.globus/userkey.nodes.pem).") print('Did you forget to ') @@ -174,8 +174,8 @@ def main(dataset_list, print('in the ~/.globus dir?') if print_categorisation or print_results: - import printer import categorisation + import printer categorised = categorisation.categorise_titles(datasets) printer.print_results(categorised, das_dir, mcm_dir, recid_file, doi_file, print_results) diff --git a/cms-2016-simulated-datasets/code/mcm_store.py b/cms-2016-simulated-datasets/code/mcm_store.py new file mode 100644 index 000000000..a920bb983 --- /dev/null +++ b/cms-2016-simulated-datasets/code/mcm_store.py @@ -0,0 +1,303 @@ +import json +import os +import re +import subprocess +import sys +import threading +from time import sleep + +from das_json_store import get_das_store_json, get_parent_dataset +from eos_store import check_datasets_in_eos_dir +from utils import get_dataset_format, get_dataset_year, get_from_deep_json + + +def mcm_downloader(dataset, mcm_dir): + "Query dictionary and setup script from McM database" + + filepath = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json" + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + print("==> " + dataset + "\n==> Already exist. Skipping...") + return + + cmd = "curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/" + + mcm_dict = subprocess.run(cmd + "produces" + dataset, + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_dict_out = str(mcm_dict.stdout.decode("utf-8")) + prepid = None + if mcm_dict_out != '{"results": {}}\n' or mcm_dict_out != '{"results": {}}': + # get prepid from mcm/dataset + prepid= get_from_deep_json(json.loads(mcm_dict_out), "prepid") + + if prepid == None: + print("Error: prepid not found in mcm for " + dataset + "\n==> Skipping dataset McM dict and script",file=sys.stderr ) + return + + outfile = mcm_dir + "/dict/" + dataset.replace('/', '@') + ".json" + with open(outfile, 'w') as dict_file: + dict_file.write(mcm_dict_out) + + mcm_script = subprocess.run(cmd + "get_test/" + prepid , + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_script_out = str(mcm_script.stdout.decode("utf-8")) + + if mcm_script_out == '' or mcm_script_out[0] == '{': + print("[ERROR] Empty McM script (get_test) for {ds}".format(ds=dataset), + file=sys.stderr) + else: + outfile = mcm_dir + "/scripts/" + dataset.replace('/', '@') + ".sh" + with open(outfile, 'w') as dict_file: + dict_file.write(mcm_script_out) + + ### New 2016 + # create a directory with the dataset name under mcm_dir + "/chain" + # create dirs + path = mcm_dir + "/chain/" + dataset.replace('/', '@') + os.makedirs(path, exist_ok=True) + + # command line: curl -s -k https://cms-pdmv.cern.ch/mcm/public/restapi/requests/get/ | jq .results.member_of_chain + # FIXME: change shell jq to deep json query + mcm_chain_prepid = subprocess.run(cmd + "get/" + prepid + " | jq .results.member_of_chain", + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_chain_prepid_out = str(mcm_chain_prepid.stdout.decode("utf-8")) + chain_prepid = re.sub(r"[\[\]]", "", 
mcm_chain_prepid_out).strip() + + # now query for the full chain + # commands line: curl -L -s -b cookies.txt https://cms-pdmv.cern.ch/mcm/restapi/chained_requests/get/ | jq .results.chain + # FIXME: change shell jq to deep json query + # REQUIRES: run on command line first: auth-get-sso-cookie -u https://cms-pdmv.cern.ch/mcm -o cookies.txt + chaincmd = "curl -L -s -b cookies.txt https://cms-pdmv.cern.ch/mcm/restapi/chained_requests/" + mcm_chain_prepids = subprocess.run(chaincmd + "get/" + chain_prepid + " | jq .results.chain", + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_chain_prepids_out = str(mcm_chain_prepids.stdout.decode("utf-8")) + json_object = json.loads(mcm_chain_prepids_out) + for step in json_object: + step_path = mcm_dir + "/chain/" + dataset.replace('/', '@') + "/" + step + for path in [step_path, step_path + "/dict", step_path + "/scripts"]: + os.makedirs(path, exist_ok=True) + + mcm_step_dict = subprocess.run(cmd + "get/" + step, + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_step_dict_out = str(mcm_step_dict.stdout.decode("utf-8")) + + # check if it is empty (then there is no way to get dataset McM dict) + if mcm_step_dict_out == '{"results": {}}\n' or mcm_step_dict_out == '{"results": {}}': + print(step + "[ERROR] Empty McM dict (get) for {ds} \n with prepid {pd}".format(ds=dataset,pd=prepid), + file=sys.stderr) + else: + outfile = step_path + "/dict/" + dataset.replace('/', '@') + ".json" + with open(outfile, 'w') as dict_file: + dict_file.write(mcm_step_dict_out) + + mcm_step_script = subprocess.run(cmd + "get_test/" + step , + shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + mcm_step_script_out = str(mcm_step_script.stdout.decode("utf-8")) + + if mcm_step_script_out == '' or mcm_step_script_out[0] == '{': + print(step + " [ERROR] Empty McM script (get_test) for {ds}".format(ds=dataset), + file=sys.stderr) + else: + outfile = step_path + "/scripts/" + dataset.replace('/', '@') + ".sh" + with open(outfile, 'w') as dict_file: + dict_file.write(mcm_step_script_out) + + ### End new + + +def create(datasets, mcm_dir, eos_dir, ignore_eos_store=False): + "Get information from McM about each dataset" + + # create dirs + for path in [mcm_dir + "/dict", mcm_dir + "/scripts", mcm_dir + "/chain"]: + os.makedirs(path, exist_ok=True) + + # only for datasets with EOS file information + if ignore_eos_store: + eos_datasets = datasets.copy() + else: + eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir) + + total = len(eos_datasets) + i = 1 + for dataset in eos_datasets: + print("McM Storing ({i}/{N}) {ds}".format(i=i, N=total, ds=dataset)) + t = threading.Thread(target=mcm_downloader, args=(dataset, mcm_dir)) + t.start() + while threading.activeCount() >= 100 : + sleep(0.5) # run 100 curl commands in parallel + i += 1 + + +def get_mcm_dict(dataset, mcm_dir): + """Return cached McM dictionary of dataset in json format""" + + filepath = mcm_dir + '/dict/' + dataset.replace('/', '@') + '.json' + if os.path.exists(filepath) and os.stat(filepath).st_size != 0: + with open(filepath, 'r') as filestream: + return json.load(filestream) + else: + print('[ERROR] There is no McM JSON store dict for dataset ' + dataset, + file=sys.stderr) + return json.loads('{}') + + +def get_prepId_from_das(dataset, das_dir): + "get prepid for dataset" + + # get prepid from das/dataset + prepid = get_from_deep_json(get_das_store_json(dataset, 'dataset', das_dir), 'prep_id') + + if prepid == None: + # try to get from das/mcm: + prepid = 
get_from_deep_json(get_das_store_json(dataset, 'mcm', das_dir), 'prepid') + # todo also try different queries from the json. prep_id? + + return prepid + + +def get_prepid_from_mcm(dataset, mcm_dir): + "get prepid for dataset from McM store. Not used in 2016" + + # get prepid from das/dataset + prepid = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), 'prep_id') + + if prepid == None: + # try different queries from the json. prep_id? + prepid = get_from_deep_json(get_mcm_dict(dataset, mcm_dir), 'prepid') + + return prepid + + +def get_global_tag(dataset, mcm_dir): + "Get global tag from McM dictionary" + mcm_dict = get_mcm_dict(dataset, mcm_dir) + global_tag = get_from_deep_json(mcm_dict, 'conditions') + + if not global_tag: + global_tag = '' + + return global_tag + + +def get_cmssw_version_from_mcm(dataset, mcm_dir): + "Get CMSSW version from McM dictionary" + mcm_dict = get_mcm_dict(dataset, mcm_dir) + cmssw = get_from_deep_json(mcm_dict, 'cmssw_release') + + if not cmssw: + cmssw = '' + + return cmssw + + +def get_cmsDriver_script(dataset, mcm_dir): + """Return path to cmsDriver script for that dataset""" + if dataset == None: + return None + + script = mcm_dir + '/scripts/' + dataset.replace('/', '@') + '.sh' + if os.path.exists(script): + return script + else: + return None + + +def get_genfragment_url(dataset, mcm_dir): + "return list of url's of the genfragments used" + url = [] + script_path = get_cmsDriver_script(dataset, mcm_dir) + if script_path == None: + return None + + with open(script_path, 'r') as script: + for line in script: + if 'curl' in line: + curl = re.search('(?Phttps?://[^\s]+)', line) + if curl: + url.append(curl.group('url')) + return url + + +def get_dataset_energy(dataset, mcm_dir): + "Return energy of that dataset in TeV" + mcm_dict = get_mcm_dict(dataset, mcm_dir) + if mcm_dict: + energy = get_from_deep_json(mcm_dict, 'energy') + if isinstance(energy, str): + return energy + else: + return str(energy).replace('.0', '') + 'TeV' + + else: + year = get_dataset_year(dataset) + return { + 2010: '7TeV', + 2011: '7TeV', + 2012: '8TeV', + 2015: '13TeV', + 2016: '13TeV', + }.get(year, 0) + + +def get_generator_name(dataset, mcm_dir): + "Return list of generators used for that dataset" + generator_names = [] + mcm_dict = get_mcm_dict(dataset, mcm_dir) + generators = get_from_deep_json(mcm_dict, 'generators') + + if generators: + for item in generators: + for char in ['"', '\\', '[', ']']: # remove ", \, [, ] + item = item.replace(char, '') + generator = item + if generator not in generator_names: + generator_names.append(item) + + return generator_names + + +def get_parent_dataset_from_mcm(dataset, mcm_dir): + "Return parent dataset to given DATASET from McM." + parent_dataset = '' + mcm_dict = get_mcm_dict(dataset, mcm_dir) + parent_dataset = get_from_deep_json(mcm_dict, 'input_dataset') + return parent_dataset + +def get_output_dataset_from_mcm(dataset, mcm_dir): + "Return output dataset to given production step of a DATASET from McM." 
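+    # Note: mcm_dir is usually a per-step directory under <mcm_dir>/chain/<dataset>/,
+    # so the McM dict read here describes that particular chain step.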
+ mcm_dict = get_mcm_dict(dataset, mcm_dir) + output_dataset = get_from_deep_json(mcm_dict, 'output_dataset') + return output_dataset + +def get_conffile_ids_from_mcm(dataset, mcm_dir): + """Return location of the configuration files for the dataset from McM.""" + config_ids = [] + mcm_dict = get_mcm_dict(dataset, mcm_dir) + config_ids = get_from_deep_json(mcm_dict, 'config_id') + return config_ids + + +def get_generator_parameters_from_mcm(dataset, mcm_dir): + """Return generator parameters dictionary for given dataset.""" + mcm_dict = get_mcm_dict(dataset, mcm_dir) + out = get_from_deep_json(mcm_dict, 'generator_parameters') + if out: + return out[0] + else: + return {} + + +def get_pileup_from_mcm(dataset, mcm_dir): + """Return pileup_dataset_name from the DIGIPremix step of a given dataset.""" + pileup='' + path = mcm_dir + '/chain/' + dataset.replace('/', '@') + step_dirs = os.listdir(path) + for step_dir in step_dirs: + m = re.search('-(.+?)-', step_dir) + if m: + step_name = m.group(1) + if step_name.endswith('DIGIPremix'): + mcm_step_dir = path + '/' + step_dir + mcm_dict = get_mcm_dict(dataset, mcm_step_dir) + pileup = get_from_deep_json(mcm_dict, 'pileup_dataset_name') + return pileup diff --git a/cms-2016-simulated-datasets/inputs/CMS-2016-mc-datasets.txt b/cms-2016-simulated-datasets/inputs/CMS-2016-mc-datasets.txt new file mode 100644 index 000000000..f001f60c8 --- /dev/null +++ b/cms-2016-simulated-datasets/inputs/CMS-2016-mc-datasets.txt @@ -0,0 +1,3 @@ +/ADDmonoPhoton_MD-1_d-3_TuneCP5_13TeV-pythia8/RunIISummer20UL16NanoAODv9-106X_mcRun2_asymptotic_v17-v2/NANOAODSIM +/BBH_HToJPsiG_JPsiToMuMu_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL16MiniAODv2-106X_mcRun2_asymptotic_v17-v1/MINIAODSIM +/WminusJetsToTauNu_TauToMu_TuneCP5_13TeV-powhegMiNNLO-pythia8-photos/RunIISummer20UL16MiniAODv2-106X_mcRun2_asymptotic_v17-v1/MINIAODSIM diff --git a/cms-2016-simulated-datasets/inputs/doi-sim.txt b/cms-2016-simulated-datasets/inputs/doi-sim.txt new file mode 100644 index 000000000..e69de29bb diff --git a/cms-2016-simulated-datasets/inputs/recid_info.py b/cms-2016-simulated-datasets/inputs/recid_info.py new file mode 100644 index 000000000..d9337cb41 --- /dev/null +++ b/cms-2016-simulated-datasets/inputs/recid_info.py @@ -0,0 +1,5 @@ +RECID_INFO ={ +"/ADDmonoPhoton_MD-1_d-3_TuneCP5_13TeV-pythia8/RunIISummer20UL16NanoAODv9-106X_mcRun2_asymptotic_v17-v2/NANOAODSIM": 30000, +"/BBH_HToJPsiG_JPsiToMuMu_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL16MiniAODv2-106X_mcRun2_asymptotic_v17-v1/MINIAODSIM": 30001, +"/WminusJetsToTauNu_TauToMu_TuneCP5_13TeV-powhegMiNNLO-pythia8-photos/RunIISummer20UL16MiniAODv2-106X_mcRun2_asymptotic_v17-v1/MINIAODSIM": 30002 +}