diff --git a/nbs/99_manuscript/k_max/01-k_max-runs.ipynb b/nbs/99_manuscript/k_max/01-k_max-runs.ipynb new file mode 100644 index 00000000..99126245 --- /dev/null +++ b/nbs/99_manuscript/k_max/01-k_max-runs.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c1948eb4-fb63-4fad-8243-bcd57b98def3", + "metadata": { + "papermill": { + "duration": 0.004605, + "end_time": "2024-01-07T09:05:24.164719", + "exception": false, + "start_time": "2024-01-07T09:05:24.160114", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Description" + ] + }, + { + "cell_type": "markdown", + "id": "d5775ff8-d222-4fbc-b1a9-8366662c2896", + "metadata": { + "papermill": { + "duration": 0.003473, + "end_time": "2024-01-07T09:05:24.178680", + "exception": false, + "start_time": "2024-01-07T09:05:24.175207", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "d6252b4c-6d56-4cf0-aecd-5d8c769a3609", + "metadata": { + "papermill": { + "duration": 0.003437, + "end_time": "2024-01-07T09:05:24.187092", + "exception": false, + "start_time": "2024-01-07T09:05:24.183655", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Modules loading" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77e9d29d-5307-4b4a-b103-7d1fbd6a7e56", + "metadata": { + "papermill": { + "duration": 0.477215, + "end_time": "2024-01-07T09:05:24.667904", + "exception": false, + "start_time": "2024-01-07T09:05:24.190689", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tqdm import tqdm\n", + "\n", + "from ccc import conf\n", + "from ccc.coef import ccc" + ] + }, + { + "cell_type": "markdown", + "id": "262eb535-3e5d-43d7-9efd-bd6cfdac9190", + "metadata": { + "papermill": { + "duration": 0.010475, + "end_time": "2024-01-07T09:05:24.682330", + "exception": false, + "start_time": 
"2024-01-07T09:05:24.671855", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Settings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1e5a523-e22f-42e1-b27f-929bebaea711", + "metadata": { + "papermill": { + "duration": 0.011834, + "end_time": "2024-01-07T09:05:24.697826", + "exception": false, + "start_time": "2024-01-07T09:05:24.685992", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "display(conf.GENERAL[\"N_JOBS\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d2bd14a-a40d-4710-98c2-587148d0f532", + "metadata": { + "papermill": { + "duration": 0.007589, + "end_time": "2024-01-07T09:05:24.709366", + "exception": false, + "start_time": "2024-01-07T09:05:24.701777", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "DATA_SIZES = [\n", + " 200,\n", + " 600,\n", + " 1800,\n", + "]\n", + "\n", + "# split data size in this many points\n", + "K_MAX_N_SPLITS = 10\n", + "\n", + "# always include this value since it is the default we use in CCC\n", + "DEFAULT_K_MAX = 10\n", + "\n", + "# N_REPS = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d24607ca-2774-4e51-b392-538c85c86052", + "metadata": { + "papermill": { + "duration": 0.007498, + "end_time": "2024-01-07T09:05:24.720708", + "exception": false, + "start_time": "2024-01-07T09:05:24.713210", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "np.random.seed(0)" + ] + }, + { + "cell_type": "markdown", + "id": "eec9bb6d-4ba2-4817-9b36-bc8e68a2beae", + "metadata": { + "papermill": { + "duration": 0.003708, + "end_time": "2024-01-07T09:05:24.728379", + "exception": false, + "start_time": "2024-01-07T09:05:24.724671", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee9e6b70-9abd-4714-9ce5-49c270a1b0be", + "metadata": { + "papermill": { + 
"duration": 0.008302, + "end_time": "2024-01-07T09:05:24.740578", + "exception": false, + "start_time": "2024-01-07T09:05:24.732276", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "OUTPUT_DIR = conf.RESULTS_DIR / \"k_max_test\"\n", + "OUTPUT_DIR.mkdir(parents=True, exist_ok=True)\n", + "display(OUTPUT_DIR)" + ] + }, + { + "cell_type": "markdown", + "id": "7656ffa5-c9c3-41f5-aeb5-21bde8836e81", + "metadata": { + "papermill": { + "duration": 0.002054, + "end_time": "2024-01-07T09:05:24.787023", + "exception": false, + "start_time": "2024-01-07T09:05:24.784969", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e4fac59-34a7-4b63-8aa0-e845c0a156af", + "metadata": { + "papermill": { + "duration": 0.007606, + "end_time": "2024-01-07T09:05:24.796789", + "exception": false, + "start_time": "2024-01-07T09:05:24.789183", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# initialize (i.e., compile with numba)\n", + "ccc(np.random.rand(100), np.random.rand(100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3f26ab4-ec59-4e8f-b887-14abc2cc7cae", + "metadata": { + "papermill": { + "duration": 30.612402, + "end_time": "2024-01-07T09:05:55.411465", + "exception": false, + "start_time": "2024-01-07T09:05:24.799063", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results = pd.DataFrame(columns=[\"data_size\", \"k_max\", \"k_max_as_n_fraction\", \"coef\"])\n", + "\n", + "idx = 0\n", + "for data_size in tqdm(DATA_SIZES):\n", + " # get the values for k_max to try...\n", + " k_max_splits = np.linspace(2, data_size, K_MAX_N_SPLITS)\n", + " # ... 
but always add the default k_max used by CCC\n", + " k_max_splits = [int(i) for i in np.sort(np.append(k_max_splits, DEFAULT_K_MAX))]\n", + "\n", + " # generate random data\n", + " # TODO: if I generate normal data, what happens?\n", + " # d1 = np.random.rand(data_size)\n", + " # d2 = np.random.rand(data_size)\n", + " d1 = np.random.normal(size=data_size)\n", + " d2 = np.random.normal(size=data_size)\n", + "\n", + " for k_max in tqdm(k_max_splits):\n", + " c = ccc(d1, d2, internal_n_clusters=k_max, n_jobs=conf.GENERAL[\"N_JOBS\"])\n", + "\n", + " results.loc[idx] = [data_size, k_max, k_max / data_size, c]\n", + " idx += 1\n", + "\n", + " # save\n", + " results.to_pickle(OUTPUT_DIR / \"k_max-results.pkl\")" + ] + }, + { + "cell_type": "markdown", + "id": "2597ad7d", + "metadata": { + "papermill": { + "duration": 0.004923, + "end_time": "2024-01-07T09:05:55.421440", + "exception": false, + "start_time": "2024-01-07T09:05:55.416517", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0772878d-2622-453a-9cad-01deb048bd37", + "metadata": { + "papermill": { + "duration": 0.008207, + "end_time": "2024-01-07T09:05:55.434636", + "exception": false, + "start_time": "2024-01-07T09:05:55.426429", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5cae452-fab0-46be-a659-2a7425d7e44c", + "metadata": { + "papermill": { + "duration": 0.007016, + "end_time": "2024-01-07T09:05:55.445283", + "exception": false, + "start_time": "2024-01-07T09:05:55.438267", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "assert results.shape[0] == int(len(DATA_SIZES) * (K_MAX_N_SPLITS + 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7d542e8-fa10-452a-8ab5-0677a90a32ef", + "metadata": { + "papermill": { + "duration": 0.012153, + 
"end_time": "2024-01-07T09:05:55.461216", + "exception": false, + "start_time": "2024-01-07T09:05:55.449063", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "results.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d47f3e8-99fd-4c4a-b6e4-eda88e932ab7", + "metadata": { + "papermill": { + "duration": 0.003641, + "end_time": "2024-01-07T09:05:55.478643", + "exception": false, + "start_time": "2024-01-07T09:05:55.475002", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "jupytext": { + "cell_metadata_filter": "all,-execution,-papermill,-trusted", + "notebook_metadata_filter": "-jupytext.text_representation.jupytext_version" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.12" + }, + "papermill": { + "default_parameters": {}, + "duration": 32.54247, + "end_time": "2024-01-07T09:05:55.797737", + "environment_variables": {}, + "exception": null, + "input_path": "nbs/99_manuscript/k_max/01-k_max-runs.ipynb", + "output_path": "nbs/99_manuscript/k_max/01-k_max-runs.run.ipynb", + "parameters": { + "GTEX_TISSUE": "skin_sun_exposed_lower_leg" + }, + "start_time": "2024-01-07T09:05:23.255267", + "version": "2.3.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/nbs/99_manuscript/k_max/py/01-k_max-runs.py b/nbs/99_manuscript/k_max/py/01-k_max-runs.py new file mode 100644 index 00000000..c688ec6f --- /dev/null +++ b/nbs/99_manuscript/k_max/py/01-k_max-runs.py @@ -0,0 +1,110 @@ +# --- +# jupyter: +# jupytext: +# cell_metadata_filter: all,-execution,-papermill,-trusted +# notebook_metadata_filter: 
#       -jupytext.text_representation.jupytext_version
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#   kernelspec:
#     display_name: Python 3 (ipykernel)
#     language: python
#     name: python3
# ---

# %% [markdown] tags=[]
# # Description

# %% [markdown] tags=[]
# For each data size in `DATA_SIZES`, two independent random normal vectors
# are generated and `ccc` is computed between them once per candidate `k_max`
# (passed to `ccc` as `internal_n_clusters`). The candidates are
# `K_MAX_N_SPLITS` evenly spaced values between 2 and the data size, plus
# `DEFAULT_K_MAX`. One row per (data size, k_max) pair is stored and the
# table is pickled into `OUTPUT_DIR`.

# %% [markdown] tags=[]
# # Modules loading

# %% tags=[]
import numpy as np
import pandas as pd
from tqdm import tqdm

from ccc import conf
from ccc.coef import ccc

# %% [markdown] tags=[]
# # Settings

# %% tags=[]
display(conf.GENERAL["N_JOBS"])

# %% tags=[]
# number of objects in each pair of simulated vectors
DATA_SIZES = [200, 600, 1800]

# number of evenly spaced k_max values tried between 2 and the data size
K_MAX_N_SPLITS = 10

# the default k_max used in CCC; it is always added to the values tried
DEFAULT_K_MAX = 10

# NOTE: there are no repetitions; one pair of vectors is drawn per data size.

# %% tags=[]
# seed the global numpy RNG; every random draw below depends on it
np.random.seed(0)

# %% [markdown] tags=[]
# # Paths

# %% tags=[]
OUTPUT_DIR = conf.RESULTS_DIR / "k_max_test"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
display(OUTPUT_DIR)

# %% [markdown] tags=[]
# # Run

# %% tags=[]
# Warm-up call so that compilation (numba) is done before the real runs.
# NOTE: these draws come from the seeded global RNG, so this call has to stay
# here, unchanged, for the data generated below to remain the same.
ccc(np.random.rand(100), np.random.rand(100))

# %% tags=[]
results = pd.DataFrame(columns=["data_size", "k_max", "k_max_as_n_fraction", "coef"])

n_jobs = conf.GENERAL["N_JOBS"]
results_file = OUTPUT_DIR / "k_max-results.pkl"

for n_objects in tqdm(DATA_SIZES):
    # evenly spaced candidates from 2 up to the data size, plus the default
    # k_max; values are truncated to integers and tried in ascending order
    candidates = np.append(
        np.linspace(2, n_objects, K_MAX_N_SPLITS), DEFAULT_K_MAX
    )
    k_max_values = sorted(int(value) for value in candidates)

    # two independent standard-normal samples, drawn once per data size and
    # shared by all the k_max values tried for that size
    x = np.random.normal(size=n_objects)
    y = np.random.normal(size=n_objects)

    for k_max in tqdm(k_max_values):
        coef = ccc(x, y, internal_n_clusters=k_max, n_jobs=n_jobs)

        # rows are labeled 0, 1, 2, ... in insertion order
        results.loc[len(results)] = [n_objects, k_max, k_max / n_objects, coef]

    # overwrite the output file after each data size is completed
    results.to_pickle(results_file)

# %% [markdown] tags=[]
# # Check

# %% tags=[]
results.shape

# %% tags=[]
# one row per data size and k_max value tried (the splits plus the default)
expected_n_rows = len(DATA_SIZES) * (K_MAX_N_SPLITS + 1)
assert len(results) == expected_n_rows

# %% tags=[]
results.head()

# %% tags=[]