Skip to content

Commit

Permalink
Create sample lookup JSON objects for frontend
Browse files Browse the repository at this point in the history
  • Loading branch information
dhimmel committed Oct 5, 2016
1 parent 67e175d commit 0ac37c7
Show file tree
Hide file tree
Showing 4 changed files with 7,703 additions and 0 deletions.
264 changes: 264 additions & 0 deletions 5.data-to-JSON.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Export cancer-data to JSON for frontend"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import os\n",
"import json\n",
"import gzip\n",
"\n",
"import pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## gene_to_mutated_samples"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Load the bz2-compressed, tab-separated mutation matrix; the first column becomes the row index\n",
"path = os.path.join('data', 'mutation-matrix.tsv.bz2')\n",
"mutation_mat_df = pandas.read_csv(path, sep='\\t', index_col=0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Map each Entrez gene ID (a column label, cast to int) to the index labels of the rows whose value is 1.\n",
"# DataFrame.items() replaces iteritems(), which was deprecated in pandas 1.5 and removed in pandas 2.0.\n",
"gene_to_mutated_samples = {\n",
"    int(entrez_gene_id): list(series.index[series == 1])\n",
"    for entrez_gene_id, series in mutation_mat_df.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'20.68 MB'"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create the output directory if it is missing so this cell also runs on a fresh checkout\n",
"os.makedirs(os.path.join('data', 'json'), exist_ok=True)\n",
"path = os.path.join('data', 'json', 'gene_to_mutated_samples.json')\n",
"with open(path, 'w') as write_file:\n",
"    # Keys are sorted and the output is indented by two spaces\n",
"    json.dump(gene_to_mutated_samples, write_file, indent=2, sort_keys=True)\n",
"# Display the size of the written file in megabytes\n",
"'{:.2f} MB'.format(1e-6 * os.path.getsize(path))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'2.97 MB'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Write gene_to_mutated_samples as gzip-compressed JSON (text mode), then display the file size in MB\n",
"path = os.path.join('data', 'json', 'gene_to_mutated_samples.json.gz')\n",
"with gzip.open(path, mode='wt') as gz_file:\n",
"    json.dump(gene_to_mutated_samples, gz_file, sort_keys=True, indent=2)\n",
"'{:.2f} MB'.format(os.path.getsize(path) * 1e-6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## disease_to_samples"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_id</th>\n",
" <th>patient_id</th>\n",
" <th>sample_type</th>\n",
" <th>disease</th>\n",
" <th>acronym</th>\n",
" <th>organ_of_origin</th>\n",
" <th>gender</th>\n",
" <th>age_diagnosed</th>\n",
" <th>dead</th>\n",
" <th>days_survived</th>\n",
" <th>recurred</th>\n",
" <th>days_recurrence_free</th>\n",
" <th>n_mutations</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TCGA-02-0047-01</td>\n",
" <td>TCGA-02-0047</td>\n",
" <td>Primary Tumor</td>\n",
" <td>glioblastoma multiforme</td>\n",
" <td>GBM</td>\n",
" <td>Brain</td>\n",
" <td>Male</td>\n",
" <td>78.0</td>\n",
" <td>1.0</td>\n",
" <td>448.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TCGA-02-0055-01</td>\n",
" <td>TCGA-02-0055</td>\n",
" <td>Primary Tumor</td>\n",
" <td>glioblastoma multiforme</td>\n",
" <td>GBM</td>\n",
" <td>Brain</td>\n",
" <td>Female</td>\n",
" <td>62.0</td>\n",
" <td>1.0</td>\n",
" <td>76.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>33</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sample_id patient_id sample_type disease \\\n",
"0 TCGA-02-0047-01 TCGA-02-0047 Primary Tumor glioblastoma multiforme \n",
"1 TCGA-02-0055-01 TCGA-02-0055 Primary Tumor glioblastoma multiforme \n",
"\n",
" acronym organ_of_origin gender age_diagnosed dead days_survived \\\n",
"0 GBM Brain Male 78.0 1.0 448.0 \n",
"1 GBM Brain Female 62.0 1.0 76.0 \n",
"\n",
" recurred days_recurrence_free n_mutations \n",
"0 NaN NaN 39 \n",
"1 NaN NaN 33 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the tab-separated sample table and preview its first two rows\n",
"path = os.path.join('data', 'samples.tsv')\n",
"sample_df = pandas.read_csv(path, sep='\\t')\n",
"sample_df.head(n=2)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Group the sample_id column by disease acronym and keep each group's IDs as a sorted list\n",
"disease_to_samples = dict(\n",
"    (acronym, sorted(ids))\n",
"    for acronym, ids in sample_df.groupby('acronym')['sample_id']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"'0.17 MB'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Save disease_to_samples as key-sorted JSON with 2-space indentation, then display the file size in MB\n",
"path = os.path.join('data', 'json', 'disease_to_samples.json')\n",
"with open(path, mode='w') as json_file:\n",
"    json.dump(disease_to_samples, json_file, sort_keys=True, indent=2)\n",
"'{:.2f} MB'.format(os.path.getsize(path) * 1e-6)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
1 change: 1 addition & 0 deletions data/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Ignore compressed files in this directory due to file size
*.bz2
*.gz
Loading

0 comments on commit 0ac37c7

Please sign in to comment.