-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ENH] Support new format for Neurosynth and NeuroQuery data (#535)
* Initial work on new fetcher.
* Get fetcher working.
* Get conversion working.
* Fix test_fetch_neurosynth.
* Work on conversion tests.
* Fix other test. Now I just need the test files.
* Change entity order in new standard.
* Fix path.
* Update example, add test data, and fix tests.
* Add fetch_neuroquery to API.
* Clean things up a bit.
* Drop ids txt file and add metadata tsv.gz file.
* Update test files.
* Make some metadata optional.
* Add NeuroQuery stuff.
* Generalize the download example. Ref #550.
* Pin NeuroQuery to commit instead of branch.
- Loading branch information
Showing
14 changed files
with
756 additions
and
151 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,47 +4,95 @@ | |
.. _datasets2: | ||
============================================= | ||
Download and convert the Neurosynth database | ||
============================================= | ||
================================================ | ||
Download the Neurosynth or NeuroQuery databases | ||
================================================ | ||
Download and convert the Neurosynth or NeuroQuery databases (with abstracts) for analysis | |
with NiMARE. | |
.. note:: | ||
This will likely change as we work to shift database querying to a remote | ||
database, rather than handling it locally with NiMARE. | ||
.. warning:: | ||
In August 2021, the Neurosynth database was reorganized according to a new file format. | ||
As such, the ``fetch_neurosynth`` function for NiMARE versions before 0.0.10 will not work | ||
with its default parameters. | ||
In order to download the Neurosynth database in its older format using NiMARE <= 0.0.9, | ||
do the following:: | ||
nimare.extract.fetch_neurosynth( | ||
url=( | ||
"https://github.com/neurosynth/neurosynth-data/blob/" | ||
"e8f27c4a9a44dbfbc0750366166ad2ba34ac72d6/current_data.tar.gz?raw=true" | ||
), | ||
) | ||
""" | ||
############################################################################### | ||
# Start with the necessary imports | ||
# -------------------------------- | ||
import os | ||
|
||
from neurosynth.base.dataset import download | ||
from pprint import pprint | ||
|
||
import nimare | ||
|
||
############################################################################### | ||
# Download Neurosynth | ||
# -------------------------------- | ||
# ------------------- | ||
# Neurosynth's data files are stored at https://github.com/neurosynth/neurosynth-data. | ||
out_dir = os.path.abspath("../example_data/") | ||
if not os.path.isdir(out_dir): | ||
os.mkdir(out_dir) | ||
os.makedirs(out_dir, exist_ok=True) | ||
|
||
if not os.path.isfile(os.path.join(out_dir, "database.txt")): | ||
download(out_dir, unpack=True) | ||
files = nimare.extract.fetch_neurosynth( | ||
path=out_dir, | ||
version="7", | ||
overwrite=False, | ||
source="abstract", | ||
vocab="terms", | ||
) | ||
pprint(files) | ||
neurosynth_db = files[0] | ||
|
||
############################################################################### | ||
# Convert Neurosynth database to NiMARE dataset file | ||
# -------------------------------------------------- | ||
dset = nimare.io.convert_neurosynth_to_dataset( | ||
os.path.join(out_dir, "database.txt"), os.path.join(out_dir, "features.txt") | ||
neurosynth_dset = nimare.io.convert_neurosynth_to_dataset( | ||
database_file=neurosynth_db["database"], | ||
annotations_files=neurosynth_db["features"], | ||
) | ||
dset.save(os.path.join(out_dir, "neurosynth_dataset.pkl.gz")) | ||
neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset.pkl.gz")) | ||
print(neurosynth_dset) | ||
|
||
############################################################################### | ||
# Add article abstracts to dataset | ||
# -------------------------------- | ||
dset = nimare.extract.download_abstracts(dset, "example@example.edu") | |
dset.save(os.path.join(out_dir, "neurosynth_nimare_with_abstracts.pkl.gz")) | ||
# This is only possible because Neurosynth uses PMIDs as study IDs. | ||
# | ||
# Make sure you replace the example email address with your own. | ||
neurosynth_dset = nimare.extract.download_abstracts(neurosynth_dset, "example@example.edu") | |
neurosynth_dset.save(os.path.join(out_dir, "neurosynth_dataset_with_abstracts.pkl.gz")) | ||
|
||
############################################################################### | ||
# Do the same with NeuroQuery | ||
# --------------------------- | ||
# NeuroQuery's data files are stored at https://github.com/neuroquery/neuroquery_data. | ||
files = nimare.extract.fetch_neuroquery( | ||
path=out_dir, | ||
version="1", | ||
overwrite=False, | ||
source="combined", | ||
vocab="neuroquery7547", | ||
type="tfidf", | ||
) | ||
pprint(files) | ||
neuroquery_db = files[0] | ||
|
||
# Note that the conversion function's name says "neurosynth", but it also handles NeuroQuery files. | |
# The name is kept only for backwards compatibility. | |
neuroquery_dset = nimare.io.convert_neurosynth_to_dataset( | ||
database_file=neuroquery_db["database"], | ||
annotations_files=neuroquery_db["features"], | ||
) | ||
neuroquery_dset.save(os.path.join(out_dir, "neuroquery_dataset.pkl.gz")) | ||
print(neuroquery_dset) | ||
|
||
# NeuroQuery also uses PMIDs as study IDs. | ||
neuroquery_dset = nimare.extract.download_abstracts(neuroquery_dset, "example@example.edu") | |
neuroquery_dset.save(os.path.join(out_dir, "neuroquery_dataset_with_abstracts.pkl.gz")) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.