-
Notifications
You must be signed in to change notification settings - Fork 58
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Use scikit-learn for LDAModel (#607)
* Drop LDA. * Delete 03_lda.py * Use resources instead of test data. * Bundle sklearn model in new class. * More updates. * Fix. * Add test. * Update 03_plot_lda.py * Improve things. * Link to CBMA documentation. * Update 03_plot_lda.py * Update api.rst * More cleanup. * Remove Annotator class. The Annotator and Annotation classes will be developed in #618. * Update 03_plot_lda.py * Remove undefined base class.
- Loading branch information
Showing 9 changed files with 181 additions and 324 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
# ex: set sts=4 ts=4 sw=4 et:
"""
.. _annotations_lda:

==================
LDA topic modeling
==================

This example trains a latent Dirichlet allocation model with scikit-learn
using abstracts from Neurosynth.
"""
import os

import pandas as pd

from nimare import annotate
from nimare.dataset import Dataset
from nimare.utils import get_resource_path

###############################################################################
# Load dataset with abstracts
# ---------------------------
dset = Dataset(os.path.join(get_resource_path(), "neurosynth_laird_studies.json"))

###############################################################################
# Initialize LDA model
# --------------------
model = annotate.lda.LDAModel(n_topics=5, max_iter=1000, text_column="abstract")

###############################################################################
# Run model
# ---------
# ``fit`` returns a new Dataset whose annotations include the per-study topic
# weights produced by the LDA model.
new_dset = model.fit(dset)

###############################################################################
# View results
# ------------
# This DataFrame is very large, so we will only show a slice of it.
new_dset.annotations[new_dset.annotations.columns[:10]].head(10)

###############################################################################
# Given that this DataFrame is very wide (many terms), we will transpose it
# before presenting it.
model.distributions_["p_topic_g_word_df"].T.head(10)

###############################################################################
# Show the top ``n_top_terms`` terms for each topic.
# ``Series.nlargest`` selects the top-weighted tokens per topic directly,
# which avoids sorting the full vocabulary for every column, the temporary
# full-DataFrame copy, and the column-by-column ``.loc`` fill of the original
# implementation. The result is identical: one column per topic, rows indexed
# 0..n_top_terms-1 with the index named "Token".
n_top_terms = 10
p_word_g_topic = model.distributions_["p_topic_g_word_df"].T
top_term_df = pd.DataFrame(
    {
        topic: p_word_g_topic[topic].nlargest(n_top_terms).index.tolist()
        for topic in p_word_g_topic.columns
    }
)
top_term_df.index.name = "Token"

top_term_df
Oops, something went wrong.