Create sample lookup JSON objects for frontend

cognoma · Oct 5, 2016 · 0ac37c7 · 0ac37c7
1 parent 67e175d
commit 0ac37c7
Show file tree

Hide file tree

Showing 4 changed files with 7,703 additions and 0 deletions.
diff --git a/5.data-to-JSON.ipynb b/5.data-to-JSON.ipynb
@@ -0,0 +1,264 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Export cancer-data to JSON for frontend"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import json\n",
+    "import gzip\n",
+    "\n",
+    "import pandas"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## gene_to_mutated_samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Read the tab-separated mutation matrix (.tsv.bz2); the first column becomes the row index.\n",
+    "path = os.path.join('data', 'mutation-matrix.tsv.bz2')\n",
+    "mutation_mat_df = pandas.read_csv(path, sep='\\t', index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# For every column of the mutation matrix (keyed by its label cast to int, an Entrez gene ID),\n",
+    "# collect the row index labels whose entry equals 1.\n",
+    "# DataFrame.items() is used because DataFrame.iteritems() was removed in pandas 2.0.\n",
+    "gene_to_mutated_samples = {\n",
+    "    int(entrez_gene_id): list(series.index[series == 1])\n",
+    "    for entrez_gene_id, series in mutation_mat_df.items()\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'20.68 MB'"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Write the gene -> mutated-samples lookup as indented, key-sorted JSON.\n",
+    "path = os.path.join('data', 'json', 'gene_to_mutated_samples.json')\n",
+    "# Create the output directory when it is missing so the cell also runs on a fresh checkout.\n",
+    "os.makedirs(os.path.dirname(path), exist_ok=True)\n",
+    "with open(path, 'w') as write_file:\n",
+    "    json.dump(gene_to_mutated_samples, write_file, indent=2, sort_keys=True)\n",
+    "# Cell output: size of the written file in megabytes (10^6 bytes), two decimals.\n",
+    "'{:.2f} MB'.format(1e-6 * os.path.getsize(path))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'2.97 MB'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Serialize the same lookup through a gzip text stream, then show the compressed file size.\n",
+    "path = os.path.join('data', 'json', 'gene_to_mutated_samples.json.gz')\n",
+    "with gzip.open(path, mode='wt') as write_file:\n",
+    "    json.dump(\n",
+    "        gene_to_mutated_samples,\n",
+    "        write_file,\n",
+    "        sort_keys=True,\n",
+    "        indent=2,\n",
+    "    )\n",
+    "'{:.2f} MB'.format(os.path.getsize(path) * 1e-6)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## disease_to_samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sample_id</th>\n",
+       "      <th>patient_id</th>\n",
+       "      <th>sample_type</th>\n",
+       "      <th>disease</th>\n",
+       "      <th>acronym</th>\n",
+       "      <th>organ_of_origin</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>age_diagnosed</th>\n",
+       "      <th>dead</th>\n",
+       "      <th>days_survived</th>\n",
+       "      <th>recurred</th>\n",
+       "      <th>days_recurrence_free</th>\n",
+       "      <th>n_mutations</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>TCGA-02-0047-01</td>\n",
+       "      <td>TCGA-02-0047</td>\n",
+       "      <td>Primary Tumor</td>\n",
+       "      <td>glioblastoma multiforme</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>78.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>448.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>39</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>TCGA-02-0055-01</td>\n",
+       "      <td>TCGA-02-0055</td>\n",
+       "      <td>Primary Tumor</td>\n",
+       "      <td>glioblastoma multiforme</td>\n",
+       "      <td>GBM</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>62.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>76.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>33</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "         sample_id    patient_id    sample_type                  disease  \\\n",
+       "0  TCGA-02-0047-01  TCGA-02-0047  Primary Tumor  glioblastoma multiforme   \n",
+       "1  TCGA-02-0055-01  TCGA-02-0055  Primary Tumor  glioblastoma multiforme   \n",
+       "\n",
+       "  acronym organ_of_origin  gender  age_diagnosed  dead  days_survived  \\\n",
+       "0     GBM           Brain    Male           78.0   1.0          448.0   \n",
+       "1     GBM           Brain  Female           62.0   1.0           76.0   \n",
+       "\n",
+       "   recurred  days_recurrence_free  n_mutations  \n",
+       "0       NaN                   NaN           39  \n",
+       "1       NaN                   NaN           33  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Load the tab-separated sample table and preview its first two rows.\n",
+    "path = os.path.join('data', 'samples.tsv')\n",
+    "sample_df = pandas.read_csv(path, sep='\\t')\n",
+    "sample_df.head(n=2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# Group the sample_id column by acronym; each group's IDs are stored as a sorted list.\n",
+    "disease_to_samples = {\n",
+    "    acronym: sorted(group_sample_ids)\n",
+    "    for acronym, group_sample_ids in sample_df.groupby('acronym')['sample_id']\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'0.17 MB'"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Write the acronym -> samples lookup as indented, key-sorted JSON, then show the file size in MB.\n",
+    "path = os.path.join('data', 'json', 'disease_to_samples.json')\n",
+    "with open(path, mode='w') as write_file:\n",
+    "    json.dump(\n",
+    "        disease_to_samples,\n",
+    "        write_file,\n",
+    "        sort_keys=True,\n",
+    "        indent=2,\n",
+    "    )\n",
+    "'{:.2f} MB'.format(os.path.getsize(path) * 1e-6)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [default]",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/data/.gitignore b/data/.gitignore
@@ -1,2 +1,3 @@
 # Ignore compressed files in this directory due to file size
 *.bz2
+*.gz