diff --git a/src/cli.js b/src/cli.js index adba86f..441f53a 100755 --- a/src/cli.js +++ b/src/cli.js @@ -8,7 +8,8 @@ import { newDraft } from './drafting/new-draft.js'; import { enrich } from './enrichment/enrich.js'; import { finalize } from './finalizing/finalize.js'; import { list } from './list.js'; -import { migrateLandmarks } from './migration/ccf-landmarks/migrate.js'; +import { migrateCcfLandmarks } from './migration/ccf-landmarks/migrate.js'; +import { migrateCcfReleases } from './migration/ccf-releases/migrate.js'; import { normalize } from './normalization/normalize.js'; import { getContext, getProcessorVersion, parseDirectory } from './utils/context.js'; import { error } from './utils/logging.js'; @@ -100,9 +101,17 @@ program program .command('migrate-ccf-landmarks') - .description('Migrate ccf landmarks to HRA Digital Object') + .description('Migrate ccf landmarks to HRA Digital Object format') .action((_options, command) => { - migrateLandmarks(getContext(program, command)); + migrateCcfLandmarks(getContext(program, command)); + }); + +program + .command('migrate-ccf-releases') + .description('Migrate ccf releases to HRA Digital Object format') + .argument('', 'Path to the ccf-releases repository checked out locally', parseDirectory) + .action((ccfReleasesPath, _options, command) => { + migrateCcfReleases({ ...getContext(program, command), ccfReleasesPath }); }); program diff --git a/src/migration/ccf-landmarks/migrate.js b/src/migration/ccf-landmarks/migrate.js index 597adad..0cc888b 100644 --- a/src/migration/ccf-landmarks/migrate.js +++ b/src/migration/ccf-landmarks/migrate.js @@ -10,7 +10,7 @@ import { getLandmarkMetadata } from './renaming.js'; const CROSSWALK_HEADER = ['extraction_set_for', 'extraction_set_id', 'extraction_set_label', 'node_name', 'label']; const SOURCE_DATA_URL = 'https://raw.githubusercontent.com/hubmapconsortium/hubmap-ontology/master/source_data'; -export async function migrateLandmarks(context) { +export async function 
migrateCcfLandmarks(context) { const extractionSiteUrls = (await fetchCsv(`${SOURCE_DATA_URL}/extraction-site-config.csv`)).map((row) => row.object); const fullCrosswalk = await fetchCsv(`${SOURCE_DATA_URL}/asct-b-3d-models-landmarks.csv`, 10); diff --git a/src/migration/ccf-releases/2d-ftu-lookup.csv b/src/migration/ccf-releases/2d-ftu-lookup.csv new file mode 100644 index 0000000..c1584f4 --- /dev/null +++ b/src/migration/ccf-releases/2d-ftu-lookup.csv @@ -0,0 +1,26 @@ +id,representation_of,label,old_id,organ_label,organ_id +kidney-ascending-thin-loop-of-henle,UBERON:0004193,loop of Henle ascending limb thin segment,#FTUAscendingThinLimb +kidney-cortical-collecting-duct,UBERON:0004203,Cortical Collecting Duct,#FTUCorticalCollectingDuct +kidney-descending-thin-loop-of-henle,UBERON:0001289,descending limb of loop of Henle,#FTUDescendingThinLimb +kidney-inner-medullary-collecting-duct,UBERON:0004205,inner medullary collecting duct,#FTUInnerMedullaryCollectingDuct +kidney-nephron,UBERON:0001285,nephron,#FTUNephron,Kidney,UBERON:0002113 +kidney-outer-medullary-collecting-duct,UBERON:0004204,outer medullary collecting duct,#FTUOuterMedullaryCollectingDuct +kidney-renal-corpuscle,UBERON:0001229,renal corpuscle,#FTURenalCorpuscle,Kidney,UBERON:0002113 +kidney-thick-ascending-loop-of-henle,UBERON:0001291,thick ascending limb of loop of Henle,#FTUThickAscendingLimb +large-intestine-crypt-lieberkuhn,UBERON:0001984,crypt of Lieberkuhn of large intestine,#FTUCryptOfLieberkuhn,Large Intestine,UBERON:0000059 +liver-liver-lobule,UBERON:0004647,liver lobule,#FTULiverLobule_inset1,Liver,UBERON:0002107 +liver-liver-lobule,UBERON:0004647,liver lobule,#FTULiverLobule_inset2,Liver,UBERON:0002107 +lung-bronchial-submucosal-gland,UBERON_8410043,bronchus submucosal gland,#FTUBronchialSubmucosalGland +lung-pulmonary-alveolus,UBERON:0002299,alveolus of lung,#FTUAlveoli,Lung,UBERON:0002048 +pancreas-intercalated-duct,UBERON:0014726,intercalated duct of pancreas,#FTUIntercalatedDuct 
+pancreas-islets-langerhans,UBERON:0000006,islet of Langerhans,#FTUIsletOfLangerhans,Pancreas,UBERON:0001264 +pancreas-pancreatic-acinus,UBERON:0001263,pancreatic acinus,#FTUAcinus +prostate-prostate-glandular-acinus,UBERON:0004179,prostate glandular acinus,#FTUProstateGlandularAcinus,Prostate Gland,UBERON:0002367 +skin-dermal-papilla,UBERON:0001992,papillary layer of dermis,#FTUDermalPapilla +skin-epidermal-ridge,UBERON:0013487,epidermal ridge of digit,#FTUEpidermalRidge +spleen-red-pulp,UBERON:0001250,red pulp of spleen,#FTURedPulp_Inset1 +spleen-red-pulp,UBERON:0001250,red pulp of spleen,#FTURedPulp_Inset2 +spleen-white-pulp,UBERON:0001959,white pulp of spleen,#FTUWhitePulp_Inset1 +spleen-white-pulp,UBERON:0001959,white pulp of spleen,#FTUWhitePulp_Inset2 +thymus-thymus-lobule,UBERON:0002125,thymus lobule,#FTUThymusLobule_Inset1,Thymus,UBERON:0002370 +thymus-thymus-lobule,UBERON:0002125,thymus lobule,#FTUThymusLobule_Inset2,Thymus,UBERON:0002370 diff --git a/src/migration/ccf-releases/hra-metadata.yaml b/src/migration/ccf-releases/hra-metadata.yaml new file mode 100644 index 0000000..be1fba2 --- /dev/null +++ b/src/migration/ccf-releases/hra-metadata.yaml @@ -0,0 +1,38 @@ +title: Human Reference Atlas (HRA) +description: 'Human Reference Atlas (HRA) ' +creators: + - fullName: Katy Börner + firstName: Katy + lastName: Börner + orcid: 0000-0002-3321-6137 +project_leads: + - fullName: Katy Börner + firstName: Katy + lastName: Börner + orcid: 0000-0002-3321-6137 +reviewers: + - fullName: Ellen M. Quardokus + firstName: Ellen + lastName: Quardokus + orcid: 0000-0001-7655-4833 +externalReviewers: [] +creation_date: '2022-05-06' +license: >- + Creative Commons Attribution 4.0 International ([CC BY + 4.0](https://creativecommons.org/licenses/by/4.0/)) +publisher: HuBMAP +funders: + - funder: National Institutes of Health + awardNumber: OT2OD026671 +hubmapId: HBM248.CBJV.556 +doi: https://doi.org/10.48539/HBM248.CBJV.556 +citation: >- + Sanjay Jain; M. 
/**
 * Renames applied after a digital object name has been derived from a legacy
 * markdown file name. Keys are derived names; values are the names to use instead.
 */
const NAME_REMAPPING = {
  'asctb-3d-models-crosswalk': 'asct-b-3d-models-crosswalk',
  'asctb-crosswalk': 'asct-b-2d-models-crosswalk',
  'bone-marrow-pelvis': 'bonemarrow-pelvis',
  'intestine-large': 'large-intestine',
  'ln-ibex': '1-human-lymph-node-ibex',
  'lymph-node-ibex': '1-human-lymph-node-ibex',
  'intestines-codex': '2-intestine-codex',
  'kidney-codex': '3-kidney-codex',
  'skin-celldive': '4-skin-cell-dive',
  'liver-sim': '5-liver-sims',
  'pancreas-codex': '6-pancreas-codex',
  'lung-celldive': '7-lung-cell-dive',
  'intestine-large-male': 'large-intestine-male',
  'intestine-large-female': 'large-intestine-female',
  'vasculature-male': 'blood-vasculature-male',
  'vasculature-female': 'blood-vasculature-female',
  vasculature: 'blood-vasculature',
  brain: 'allen-brain',
  'bone-marrow-and-blood': 'bonemarrow-pelvis',
};

/** Month abbreviations in the same form `Date.prototype.toDateString()` emits. */
const MONTH_ABBREVIATIONS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];

/** Matches date-only ISO strings (`YYYY-MM-DD`), which `Date` parses as UTC midnight. */
const ISO_DATE_ONLY_PATTERN = /^\d{4}-\d{2}-\d{2}$/;

/**
 * Matches the HTML entity for "o with diaeresis". The entity name is wrapped in a
 * non-capturing group so that tools which unescape HTML entities in source text
 * cannot silently turn this pattern into a literal character.
 */
const OUML_ENTITY_PATTERN = /&(?:ouml);/g;

/**
 * Reads one markdown file and exposes the values found in it: the title line,
 * the description section and the `| **Key:** | value |` table rows.
 */
export class HraMarkdownParser {
  /**
   * @param {string} inputFile - Path to the markdown file. The name of its parent
   *   directory is used as the digital object type (see `getDoType`).
   */
  constructor(inputFile) {
    this.inputFile = inputFile;
    // Entities must be decoded before any value is split on ';' (see getMultiValue),
    // otherwise the ';' that terminates the entity would break a name in two.
    this.rawMd = readFileSync(inputFile).toString().replace(OUML_ENTITY_PATTERN, 'ö').trim().split('\n');
  }

  /**
   * @param {string} key - Metadata label without the surrounding `**` and trailing `:`.
   * @returns {string|undefined} The first line containing `**key:**`, if any.
   */
  findKeyLine(key) {
    const marker = `**${key}:**`;
    return this.rawMd.find((line) => line.includes(marker));
  }

  /**
   * @param {string} key - Metadata label.
   * @returns {boolean} Whether any line contains `**key:**`.
   */
  hasKey(key) {
    return this.findKeyLine(key) !== undefined;
  }

  /**
   * @param {string} key - Metadata label.
   * @returns {string} The trimmed third `|`-separated cell of the matching line,
   *   or `''` when the key (or the cell) is absent.
   */
  getMetadata(key) {
    const line = this.findKeyLine(key);
    if (line === undefined) {
      return '';
    }
    return (line.split('|')[2] ?? '').trim();
  }

  /**
   * @param {string} key - Metadata label.
   * @returns {string[]} The metadata value split on `;` or `,`, each part trimmed.
   */
  getMultiValue(key) {
    return this.getMetadata(key)
      .split(/[;,] */)
      .map((part) => part.trim());
  }

  /**
   * Formats a date as `Mon D, YYYY` (for example `May 6, 2022`).
   *
   * @param {string} dateStr - Date text; `YYYY-MM-DD` is the form handled exactly.
   * @returns {string} The formatted date, or `''` when the text cannot be parsed.
   */
  getAccessedDate(dateStr) {
    const text = String(dateStr ?? '').trim();
    const parsed = new Date(text);
    if (Number.isNaN(parsed.getTime())) {
      return '';
    }
    if (ISO_DATE_ONLY_PATTERN.test(text)) {
      // Date-only ISO strings are parsed as UTC; reading them back with local-time
      // accessors would shift the day in time zones west of UTC.
      return `${MONTH_ABBREVIATIONS[parsed.getUTCMonth()]} ${parsed.getUTCDate()}, ${parsed.getUTCFullYear()}`;
    }
    const [, month, day, year] = parsed.toDateString().split(' ');
    return `${month} ${Number.parseInt(day, 10)}, ${year}`;
  }

  /**
   * Pairs names with ORCIDs by position.
   *
   * @param {string} nameKey - Metadata label holding the names.
   * @param {string} orcidKey - Metadata label holding the ORCIDs.
   * @returns {{fullName: string, firstName: string, lastName: string, orcid: (string|undefined)}[]}
   *   One entry per name; empty when either key is missing.
   */
  getAuthors(nameKey, orcidKey) {
    if (!this.hasKey(nameKey) || !this.hasKey(orcidKey)) {
      return [];
    }
    const orcids = this.getMultiValue(orcidKey).map((value) => {
      const open = value.indexOf('[');
      const close = value.indexOf(']');
      // Values are normally markdown links ("[id](url)"); a bare id is used as is
      // instead of being sliced with a -1 index, which would drop its last character.
      return open !== -1 && close !== -1 ? value.slice(open + 1, close).trim() : value.trim();
    });
    return this.getMultiValue(nameKey).map((fullName, index) => ({
      fullName,
      firstName: fullName.split(/ +/)[0],
      // A trailing " II" suffix is not treated as the last name.
      lastName: fullName.replace(/ II$/, '').split(/ +/).slice(-1)[0],
      orcid: orcids[index],
    }));
  }

  /**
   * Pairs funders with award numbers by position.
   *
   * @param {string} funderKey - Metadata label holding the funders.
   * @param {string} awardKey - Metadata label holding the award numbers.
   * @returns {{funder: string, awardNumber: (string|undefined)}[]}
   */
  getFunders(funderKey, awardKey) {
    const awards = this.getMultiValue(awardKey);
    return this.getMultiValue(funderKey).map((funder, index) => ({
      funder,
      awardNumber: awards[index],
    }));
  }

  /**
   * Derives the digital object name from the markdown file name.
   *
   * @returns {string} The normalized name after prefix/suffix expansion, sex and
   *   laterality reordering, and `NAME_REMAPPING`.
   */
  getName() {
    let name = basename(this.inputFile, '.md')
      .replace(`${this.getDoType()}-`, '')
      .replace(/^3d-/, '')
      .replace(/^vh-/, '')
      .replace(/^f-/, 'female-')
      .replace(/^m-/, 'male-')
      .replace(/-l$/, '-left')
      .replace(/-r$/, '-right')
      .replace(/-mapping$/, '-crosswalk');

    // 'female' must be tested first because it contains 'male'.
    let sex;
    if (name.includes('female')) {
      sex = 'female';
    } else if (name.includes('male')) {
      sex = 'male';
    }
    if (sex) {
      const hasLaterality = name.endsWith('-left') || name.endsWith('-right');
      const elts = name.split('-').filter((s) => s !== sex);

      // Format for reference organs = ${organ}-${sex}-${laterality "optional"}
      if (hasLaterality) {
        name = `${elts.slice(0, -1).join('-')}-${sex}-${elts.slice(-1).join('-')}`;
      } else {
        name = `${elts.join('-')}-${sex}`;
      }
    }

    return NAME_REMAPPING[name] || name;
  }

  /** @returns {string} The first line minus its leading character, last word and trailing comma. */
  getTitle() {
    return this.rawMd[0].slice(1).trim().split(' ').slice(0, -1).join(' ').trim().replace(/,$/, '');
  }

  /** @returns {string} The last space-separated word of the first line. */
  getVersion() {
    return this.rawMd[0].slice(1).trim().split(' ').slice(-1)[0];
  }

  /**
   * @returns {string} The line directly after the `### Description` heading, trimmed;
   *   `''` when the heading is missing or is the last line.
   */
  getDescription() {
    const headingIndex = this.rawMd.findIndex((line) => line.startsWith('### Description'));
    if (headingIndex === -1) {
      // Without this guard index -1 + 1 would return the title line as the description.
      return '';
    }
    return (this.rawMd[headingIndex + 1] ?? '').trim();
  }

  /** @returns {string} The label of the "How to Cite" row that is not the "Overall" one. */
  getHowToCiteKey() {
    return this.rawMd
      .find((l) => l.includes('**How to Cite') && !l.includes('Overall:**'))
      .split('|')[1]
      .trim()
      .replace(/\*/g, '')
      .replace(/:/g, '');
  }

  /** @returns {string} The label of the "How to Cite ... Overall" row. */
  getHowToCiteOverallKey() {
    return this.rawMd
      .find((l) => l.includes('**How to Cite') && l.includes('Overall:**'))
      .split('|')[1]
      .trim()
      .replace(/\*/g, '')
      .replace(/:/g, '');
  }

  /** @returns {string} The parent directory name, with `ref-organs` singularized. */
  getDoType() {
    return this.inputFile.split('/').slice(-2)[0].replace('ref-organs', 'ref-organ');
  }

  /** @returns {string} `type/name/version` for this digital object. */
  getDoString() {
    return [this.getDoType(), this.getName(), this.getVersion()].join('/');
  }

  /**
   * Collects all parsed values into one plain object.
   *
   * @returns {object} The metadata record; throws when the DOI or a
   *   "How to Cite" row is missing.
   */
  toJson() {
    const creationDate = this.getMetadata('Creation Date') || this.getMetadata('Date');

    return {
      title: this.getTitle(),
      description: this.getDescription(),

      // Label spellings vary between files, so each known variant is tried.
      creators: [
        ...this.getAuthors('Creator(s)', 'Creator ORCID(s)'),
        ...this.getAuthors('Creator(s)', 'Creator ORCID'),
      ],
      project_leads: this.getAuthors('Project Lead', 'Project Lead ORCID'),
      reviewers: [
        ...this.getAuthors('Reviewer(s)', 'Reviewers ORCID(s)'),
        ...this.getAuthors('Reviewer(s)', 'Reviewer ORCID(s)'),
        ...this.getAuthors('Internal Reviewer(s)', 'Internal Reviewer ORCID(s)'),
      ],
      externalReviewers: this.getAuthors('External Reviewer(s)', 'External Reviewer ORCID(s)'),

      creation_date: creationDate,
      creation_year: creationDate.split('-')[0],
      accessed_date: this.getAccessedDate(creationDate),

      license: this.getMetadata('License'),
      publisher: this.getMetadata('Publisher'),
      funders: this.getFunders('Funder', 'Award Number'),
      hubmapId: this.getMetadata('HuBMAP ID'),
      dataTable: this.getMetadata('Data Table') || this.getMetadata('3D Data') || this.getMetadata('2D Data'),
      doi: this.getMetadata('DOI').split('[')[1].split(']')[0],
      citation: this.getMetadata(this.getHowToCiteKey()),
      citationOverall: this.getMetadata(this.getHowToCiteOverallKey()),
    };
  }
}
/** File suffixes that are unpacked after the archive has been copied. */
const ARCHIVE_SUFFIXES = ['.zip', '.bz2', '.7z'];

/** Wraps a value in single quotes so a POSIX shell passes it on as one literal argument. */
function shellQuote(value) {
  return `'${String(value).replace(/'/g, "'\\''")}'`;
}

/** Builds the shell command that unpacks `member` from the archive at `srcPath`. */
function buildExtractCommand(suffix, srcPath, outDir, member, destPath) {
  switch (suffix) {
    case '.zip':
      return `unzip -o ${shellQuote(srcPath)} -d ${shellQuote(outDir)} ${shellQuote(member)}`;
    case '.bz2':
      return `bunzip2 -c ${shellQuote(srcPath)} > ${shellQuote(destPath)}`;
    case '.7z':
      return `7z e -aoa ${shellQuote(srcPath)} -o${shellQuote(outDir)} ${shellQuote(member)}`;
    default:
      return undefined;
  }
}

/**
 * Writes one digital object: copies (and, for archives, unpacks) every file linked
 * from the markdown's data table into `<doHome>/<type>/<name>/<version>/raw` and
 * writes `metadata.yaml` next to them.
 *
 * @param {object} context - Reads `doHome` and `ccfReleasesPath`.
 * @param {HraMarkdownParser} md - Parser for the digital object's markdown file.
 */
function writeDigitalObject(context, md) {
  const data = md.toJson();
  // Write out metadata.yaml
  const yamlDir = resolve(context.doHome, md.getDoType(), md.getName(), md.getVersion(), 'raw');
  sh.mkdir('-p', yamlDir);

  // Each link target is reduced to its last three path segments, which are then
  // resolved against the local ccf-releases checkout. `match` returns null when the
  // data table has no links, hence the fallback.
  const dataPaths = (data.dataTable.match(/\(https\:\/\/.*?\)/g) ?? []).map((u) =>
    u.slice(1, -1).split('/').slice(-3).join('/')
  );

  // Intermediate fields are blanked so they are not written to the YAML file.
  Object.assign(data, {
    type: undefined,
    name: undefined,
    version: undefined,
    creation_year: undefined,
    accessed_date: undefined,
    dataTable: undefined,
    datatable: [],
  });

  for (const inputSrcPath of dataPaths) {
    const fileName = inputSrcPath.split('/').slice(-1)[0];
    const srcPath = resolve(context.ccfReleasesPath, inputSrcPath);

    // The file is always copied as is; archives are additionally unpacked below.
    sh.cp(srcPath, resolve(yamlDir, fileName));

    const suffix = ARCHIVE_SUFFIXES.find((s) => fileName.endsWith(s));
    // Only the trailing suffix is removed, never an earlier occurrence in the path.
    const srcName = suffix ? fileName.slice(0, -suffix.length) : fileName;
    const destPath = resolve(yamlDir, srcName);

    if (suffix) {
      // NOTE(review): the quoting assumes sh.exec runs a POSIX shell — confirm for Windows.
      const result = sh.exec(buildExtractCommand(suffix, srcPath, yamlDir, srcName, destPath));
      if (result.code !== 0) {
        console.log(`failed to extract ${srcName} from ${srcPath} (exit code ${result.code})`);
      }
    }

    data.datatable.push(srcName);
    if (!existsSync(srcPath) || !existsSync(destPath)) {
      console.log(md.inputFile, md.getDoType(), srcPath, destPath);
    }
  }

  // Non-crosswalk reference organs and 2D FTUs also list a 'crosswalk.csv' entry.
  if (!md.getName().includes('crosswalk') && (md.getDoType() === 'ref-organ' || md.getDoType() === '2d-ftu')) {
    data.datatable.push('crosswalk.csv');
  }

  writeFileSync(resolve(yamlDir, 'metadata.yaml'), dump(data));
}

/**
 * Migrates every `v1.*` release found in a local ccf-releases checkout: one digital
 * object per markdown file, one `collection/hra/<version>` per release, and a
 * per-object `crosswalk.csv` for 2D FTUs and reference organs.
 *
 * @param {object} context - Reads `ccfReleasesPath`, `processorHome` and `doHome`.
 */
export function migrateCcfReleases(context) {
  const inputDir = context.ccfReleasesPath;
  const srcDir = resolve(context.processorHome, 'src/migration/ccf-releases');

  // Paths look like <inputDir>/<version>/markdown/<type>/<name>.md
  const mdFiles = sh.ls(resolve(inputDir, 'v1.*/markdown/*/*.md'));

  const collections = {};
  for (const mdPath of mdFiles) {
    const [collectionVersion, , type, fileName] = mdPath.split('/').slice(-4);
    const mdFile = resolve(inputDir, `${collectionVersion}/markdown/${type}/${fileName}`);
    const parser = new HraMarkdownParser(mdFile);
    writeDigitalObject(context, parser);

    collections[collectionVersion] = collections[collectionVersion] || [];
    collections[collectionVersion].push(parser.getDoString());
  }

  for (const [version, digitalObjects] of Object.entries(collections)) {
    const yamlDir = resolve(context.doHome, `collection/hra/${version}/raw`);
    sh.mkdir('-p', yamlDir);

    writeFileSync(resolve(yamlDir, 'digital-objects.yaml'), dump({ 'digital-objects': digitalObjects }));

    sh.cp(resolve(srcDir, 'hra-metadata.yaml'), resolve(yamlDir, 'metadata.yaml'));

    const isCrosswalk = (str) => str.includes('crosswalk');

    const ftuObjects = digitalObjects.filter((str) => str.startsWith('2d-ftu/'));
    const ftuCrosswalk = ftuObjects.find(isCrosswalk);
    const ftuIllustrations = ftuObjects.filter((str) => !isCrosswalk(str));
    if (ftuCrosswalk) {
      for (const doString of ftuIllustrations) {
        split2dFtuCrosswalk(context, ftuCrosswalk, doString);
      }
    } else if (ftuIllustrations.length > 0) {
      // Without a crosswalk digital object there is nothing to split from.
      console.log(`no 2d-ftu crosswalk found in ${version}; skipping ${ftuIllustrations.length} 2d-ftu crosswalk(s)`);
    }

    const refOrganObjects = digitalObjects.filter((str) => str.startsWith('ref-organ/'));
    const refOrganCrosswalk = refOrganObjects.find(isCrosswalk);
    const refOrgans = refOrganObjects.filter((str) => !isCrosswalk(str));
    if (refOrganCrosswalk) {
      for (const doString of refOrgans) {
        splitRefOrganCrosswalk(context, refOrganCrosswalk, doString);
      }
    } else if (refOrgans.length > 0) {
      console.log(`no ref-organ crosswalk found in ${version}; skipping ${refOrgans.length} ref-organ crosswalk(s)`);
    }
  }
}
/**
 * Writes `raw/crosswalk.csv` for one 2D FTU digital object, containing only the
 * rows of the collection-wide 2D crosswalk that belong to that FTU.
 *
 * The process exits with status 1 when the FTU is not in `2d-ftu-lookup.csv` or
 * when the full crosswalk cannot be located or has no header row.
 *
 * @param {object} context - Reads `processorHome` and `doHome`.
 * @param {string} crosswalkDo - `type/name/version` of the full 2D crosswalk.
 * @param {string} doString - `type/name/version` of the FTU to write a crosswalk for.
 */
export function split2dFtuCrosswalk(context, crosswalkDo, doString) {
  const srcDir = resolve(context.processorHome, 'src/migration/ccf-releases');

  const doName = doString.split('/')[1];
  const LOOKUP = resolve(srcDir, '2d-ftu-lookup.csv');
  const FIRST_COL = 'anatomical_structure_of';

  // Report on stderr and stop with a non-zero status so callers see the failure.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  const ftuLookupRows = Papa.parse(readFileSync(LOOKUP).toString(), {
    header: true,
    skipEmptyLines: true,
  }).data;
  // One FTU may appear under several old ids (one lookup row per id).
  const matchingRows = ftuLookupRows.filter((row) => row.id === doName);
  if (matchingRows.length === 0) {
    fail(`can't find ${doName} in ${LOOKUP} (needed to split ${crosswalkDo})`);
  }
  const ftuInfo = matchingRows[0];
  const oldFtuIds = new Set(matchingRows.map((row) => row.old_id));

  // Load the full crosswalk
  const crosswalkMetaFile = resolve(context.doHome, crosswalkDo, 'raw/metadata.yaml');
  const crosswalkMetadata = load(readFileSync(crosswalkMetaFile).toString());
  const crosswalkFile = crosswalkMetadata.datatable.find((s) => s.includes('.csv'));
  if (!crosswalkFile) {
    fail(`no .csv entry in the datatable of ${crosswalkMetaFile}`);
  }
  const crosswalkPath = resolve(context.doHome, crosswalkDo, 'raw', crosswalkFile);
  const crosswalkLines = readFileSync(crosswalkPath).toString().split('\n');
  // Everything before the line naming the first column is dropped.
  const headerRow = crosswalkLines.findIndex((line) => line.includes(FIRST_COL));
  if (headerRow === -1) {
    // slice(-1) would otherwise silently parse only the last line of the file.
    fail(`no header row containing ${FIRST_COL} in ${crosswalkPath}`);
  }
  const crosswalkText = crosswalkLines.slice(headerRow).join('\n');
  const crosswalkRows = Papa.parse(crosswalkText, { header: true }).data.filter(
    (row) => row['OntologyID'] !== '-' && row['OntologyID'] !== ''
  );

  // Keep the first row per node id that belongs to this FTU.
  const seen = new Set();
  const ftuCrosswalk = [];
  for (const row of crosswalkRows) {
    const id = row[FIRST_COL];
    // '#FTUAlveolus' is accepted as an alias of the '#FTUAlveoli' lookup id.
    const belongsToFtu = oldFtuIds.has(id) || (id === '#FTUAlveolus' && oldFtuIds.has('#FTUAlveoli'));
    if (!belongsToFtu || seen.has(row.node_name)) {
      continue;
    }
    seen.add(row.node_name);
    ftuCrosswalk.push({
      node_id: row.node_name,
      node_label: row.label,
      node_mapped_to: row.OntologyID,
      tissue_label: ftuInfo.label,
      tissue_mapped_to: ftuInfo.representation_of,
      // Organ values from the crosswalk row win; the lookup file is the fallback.
      organ_label: row.organ_label || ftuInfo.organ_label,
      organ_mapped_to: row.organ_id || ftuInfo.organ_id,
    });
  }

  const ftuCrosswalkFile = resolve(context.doHome, doString, 'raw/crosswalk.csv');
  writeFileSync(ftuCrosswalkFile, Papa.unparse(ftuCrosswalk, { header: true }));
  if (ftuCrosswalk.length === 0) {
    console.log(`Warning (may not be an error): no rows found in ${crosswalkDo} for ${doString}`);
  }
}
/**
 * Writes `raw/crosswalk.csv` for one reference organ digital object, containing
 * only the rows of the collection-wide 3D crosswalk that belong to that organ.
 * Digital objects under `ref-organ/united-` receive every row of the crosswalk.
 *
 * The process exits with status 1 when the organ has no `.glb` entry, when it
 * cannot be matched in `ref-organ-lookup.csv` (united organs excepted), or when
 * the full crosswalk cannot be located or has no header row.
 *
 * @param {object} context - Reads `processorHome` and `doHome`.
 * @param {string} crosswalkDo - `type/name/version` of the full 3D crosswalk.
 * @param {string} refOrganDo - `type/name/version` of the reference organ.
 */
export function splitRefOrganCrosswalk(context, crosswalkDo, refOrganDo) {
  const srcDir = resolve(context.processorHome, 'src/migration/ccf-releases');

  const LOOKUP = resolve(srcDir, 'ref-organ-lookup.csv');
  const FIRST_COL = 'anatomical_structure_of';
  const isUnited = refOrganDo.startsWith('ref-organ/united-');

  // Report on stderr and stop with a non-zero status so callers see the failure.
  const fail = (message) => {
    console.error(message);
    process.exit(1);
  };

  // Get reference organ GLB filename
  const refOrganMetadataFile = resolve(context.doHome, refOrganDo, 'raw/metadata.yaml');
  const refOrganMetadata = load(readFileSync(refOrganMetadataFile).toString());
  const glbEntry = refOrganMetadata.datatable.find((s) => s.includes('.glb'));
  if (!glbEntry) {
    fail(`no .glb entry in the datatable of ${refOrganMetadataFile}`);
  }
  const glbFile = glbEntry.replace(/\.glb.*/, '');

  // Find the old id for a reference organ. The first lookup row whose name
  // contains, or is contained in, the GLB name wins, so row order matters.
  const refOrganIdLookupRows = Papa.parse(readFileSync(LOOKUP).toString(), {
    header: true,
    skipEmptyLines: true,
  }).data;
  const refOrganIdInfo = refOrganIdLookupRows.find(
    (row) => glbFile.includes(row.glbFile) || row.glbFile.includes(glbFile)
  );
  if (!refOrganIdInfo && !isUnited) {
    fail(`can't find lookup from ${crosswalkDo} (${glbFile}) for ${refOrganDo}`);
  }
  const refOrgan = refOrganIdInfo?.['oldId'];

  // Load the full crosswalk
  const crosswalkMetaFile = resolve(context.doHome, crosswalkDo, 'raw/metadata.yaml');
  const crosswalkMetadata = load(readFileSync(crosswalkMetaFile).toString());
  const crosswalkFile = crosswalkMetadata.datatable.find((s) => s.includes('.csv'));
  if (!crosswalkFile) {
    fail(`no .csv entry in the datatable of ${crosswalkMetaFile}`);
  }
  const crosswalkPath = resolve(context.doHome, crosswalkDo, 'raw', crosswalkFile);
  const crosswalkLines = readFileSync(crosswalkPath).toString().split('\n');
  const headerRow = crosswalkLines.findIndex((line) => line.startsWith(FIRST_COL));
  if (headerRow === -1) {
    // slice(-1) would otherwise silently parse only the last line of the file.
    fail(`no header row starting with ${FIRST_COL} in ${crosswalkPath}`);
  }
  const crosswalkText = crosswalkLines.slice(headerRow).join('\n');
  // Empty lines are skipped so that a trailing newline cannot contribute a blank
  // row, which the united organs (that keep every row) would otherwise write out.
  const crosswalkRows = Papa.parse(crosswalkText, { header: true, skipEmptyLines: true }).data.filter(
    (row) => row['OntologyID'] !== '-'
  );

  // Filter the full crosswalk to just the info we need for this reference organ
  const refOrganCrosswalk = crosswalkRows
    .filter((row) => {
      // united uses all nodes as crosswalk
      if (isUnited) {
        return true;
      }
      const id = row[FIRST_COL] ?? '';

      return (
        id.startsWith(refOrgan) ||
        // Some exceptions as IDs have changed over the years
        (id === '#VHFColon' && refOrgan === '#VHFLargeIntestine') ||
        (id === '#VHFLymphNode' && glbFile === 'NIH_F_Lymph_Node') ||
        (id === '#VHMLymphNode' && glbFile === 'NIH_M_Lymph_Node')
      );
    })
    .map((row) => ({
      ...row,
      // Special case for a malformed CURIE
      OntologyID: row.OntologyID?.toLowerCase() === 'ma:fma46564' ? 'FMA:46564' : row.OntologyID,
    }));

  const refOrganCrosswalkFile = resolve(context.doHome, refOrganDo, 'raw/crosswalk.csv');
  writeFileSync(
    refOrganCrosswalkFile,
    Papa.unparse(refOrganCrosswalk, { header: true, columns: ['node_name', 'OntologyID', 'label'] })
  );
  if (refOrganCrosswalk.length === 0) {
    console.log(`no rows found in ${crosswalkDo} for ${refOrganDo}`);
  }
}