-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use scikit-learn for LDAModel (#607)
* Drop LDA. * Delete 03_lda.py * Use resources instead of test data. * Bundle sklearn model in new class. * More updates. * Fix. * Add test. * Update 03_plot_lda.py * Improve things. * Link to CBMA documentation. * Update 03_plot_lda.py * Update api.rst * More cleanup. * Remove Annotator class. The Annotator and Annotation classes will be developed in #618. * Update 03_plot_lda.py * Remove undefined base class.
- Loading branch information
Showing 9 changed files with 181 additions and 324 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
"""
.. _annotations_lda:

==================
LDA topic modeling
==================

This example trains a latent Dirichlet allocation model with scikit-learn
using abstracts from Neurosynth.
"""
import os

import pandas as pd

from nimare import annotate
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# ---------------------------
dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))

###############################################################################
# Initialize LDA model
# --------------------
model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract")

###############################################################################
# Run model
# ---------
# ``fit`` returns a new Dataset whose annotations include the per-study topic
# weights produced by the LDA model.
new_dset = model.fit(dset)

###############################################################################
# View results
# ------------
# This DataFrame is very large, so we will only show a slice of it.
new_dset.annotations[new_dset.annotations.columns[:10]].head(10)

###############################################################################
# Given that this DataFrame is very wide (many terms), we will transpose it
# before presenting it.
model.distributions_["p_topic_g_word_df"].T.head(10)

###############################################################################
# Show the top ``n_top_terms`` terms for each topic.
# ``Series.nlargest`` selects the top-weighted tokens per topic directly,
# which avoids sorting the full vocabulary for every column, the temporary
# full-DataFrame copy, and the column-by-column ``.loc`` fill of the original
# implementation. The result is identical: one column per topic, rows indexed
# 0..n_top_terms-1 with the index named "Token".
n_top_terms = 10
p_word_g_topic = model.distributions_["p_topic_g_word_df"].T
top_term_df = pd.DataFrame(
    {
        topic: p_word_g_topic[topic].nlargest(n_top_terms).index.tolist()
        for topic in p_word_g_topic.columns
    }
)
top_term_df.index.name = "Token"

top_term_df
Oops, something went wrong.