diff --git a/OMOP_demo.ipynb b/OMOP_demo.ipynb
new file mode 100644
index 0000000..fd738f6
--- /dev/null
+++ b/OMOP_demo.ipynb
@@ -0,0 +1,1065 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/anaconda3/envs/ehrapy_latents/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from ehrdata import OMOP\n",
+ "# Using MIMIC-IV demo data in the OMOP Common Data Model\n",
+ "# Download link: https://physionet.org/content/mimic-iv-demo-omop/0.9/"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "omop = OMOP(file_paths=\"/Users/xinyuezhang/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1;35m2024-01-04 13:38:37,971\u001b[0m - \u001b[1;34mroot\u001b[0m \u001b[1;30mINFO - Transformed passed dataframe into an AnnData object with n_obs x n_vars = `852` x `2`.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Load some tables\n",
+ "adata = omop.load(level='stay_level')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# TODO: decide where to store these summary statistics (e.g. in adata.uns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Explore most frequent n features in measurement table\n",
+ "feature_counts = omop.feature_statistics(source='measurement', number=20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " feature_name \n",
+ " measurement_concept_id_1 \n",
+ " measurement_concept_id_2 \n",
+ " count \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 \n",
+ " Respiratory Rate \n",
+ " 2000030002 \n",
+ " 3024171 \n",
+ " 19776 \n",
+ " \n",
+ " \n",
+ " 1 \n",
+ " Heart Rate \n",
+ " 2000030001 \n",
+ " 3027018 \n",
+ " 19319 \n",
+ " \n",
+ " \n",
+ " 2 \n",
+ " O2 saturation pulseoxymetry \n",
+ " 2000030003 \n",
+ " 40762499 \n",
+ " 17683 \n",
+ " \n",
+ " \n",
+ " 3 \n",
+ " Heart Rhythm \n",
+ " 2000030004 \n",
+ " 3022318 \n",
+ " 12441 \n",
+ " \n",
+ " \n",
+ " 4 \n",
+ " ART BP Mean \n",
+ " 2000030096 \n",
+ " 3027598 \n",
+ " 9738 \n",
+ " \n",
+ " \n",
+ " 5 \n",
+ " Arterial Blood Pressure systolic \n",
+ " 2000030009 \n",
+ " 3004249 \n",
+ " 9661 \n",
+ " \n",
+ " \n",
+ " 6 \n",
+ " Manual Blood Pressure Diastolic Right \n",
+ " 2000030157 \n",
+ " 3012888 \n",
+ " 9660 \n",
+ " \n",
+ " \n",
+ " 7 \n",
+ " Non Invasive Blood Pressure diastolic \n",
+ " 2000030006 \n",
+ " 21492240 \n",
+ " 8373 \n",
+ " \n",
+ " \n",
+ " 8 \n",
+ " Non Invasive Blood Pressure systolic \n",
+ " 2000030005 \n",
+ " 21492239 \n",
+ " 8371 \n",
+ " \n",
+ " \n",
+ " 9 \n",
+ " Non Invasive Blood Pressure mean \n",
+ " 2000030007 \n",
+ " 21492241 \n",
+ " 8366 \n",
+ " \n",
+ " \n",
+ " 10 \n",
+ " 0 \n",
+ " 0 \n",
+ " 0 \n",
+ " 6850 \n",
+ " \n",
+ " \n",
+ " 11 \n",
+ " Temperature Celsius \n",
+ " 2000030092 \n",
+ " 3020891 \n",
+ " 4057 \n",
+ " \n",
+ " \n",
+ " 12 \n",
+ " Potassium (serum) \n",
+ " 2000030062 \n",
+ " 3023103 \n",
+ " 3970 \n",
+ " \n",
+ " \n",
+ " 13 \n",
+ " Sodium (serum) \n",
+ " 2000030063 \n",
+ " 3019550 \n",
+ " 3954 \n",
+ " \n",
+ " \n",
+ " 14 \n",
+ " Chloride (serum) \n",
+ " 2000030064 \n",
+ " 3014576 \n",
+ " 3933 \n",
+ " \n",
+ " \n",
+ " 15 \n",
+ " Creatinine (serum) \n",
+ " 2000030068 \n",
+ " 3016723 \n",
+ " 3886 \n",
+ " \n",
+ " \n",
+ " 16 \n",
+ " Hemoglobin|Blood|Blood Gas \n",
+ " 2000001008 \n",
+ " 3000963 \n",
+ " 3778 \n",
+ " \n",
+ " \n",
+ " 17 \n",
+ " HCO3 (serum) \n",
+ " 2000030069 \n",
+ " 3016293 \n",
+ " 3763 \n",
+ " \n",
+ " \n",
+ " 18 \n",
+ " Glucose (serum) \n",
+ " 2000030072 \n",
+ " 3004501 \n",
+ " 3597 \n",
+ " \n",
+ " \n",
+ " 19 \n",
+ " GCS - Eye Opening \n",
+ " 2000030013 \n",
+ " 3016335 \n",
+ " 3268 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " feature_name measurement_concept_id_1 \\\n",
+ "0 Respiratory Rate 2000030002 \n",
+ "1 Heart Rate 2000030001 \n",
+ "2 O2 saturation pulseoxymetry 2000030003 \n",
+ "3 Heart Rhythm 2000030004 \n",
+ "4 ART BP Mean 2000030096 \n",
+ "5 Arterial Blood Pressure systolic 2000030009 \n",
+ "6 Manual Blood Pressure Diastolic Right 2000030157 \n",
+ "7 Non Invasive Blood Pressure diastolic 2000030006 \n",
+ "8 Non Invasive Blood Pressure systolic 2000030005 \n",
+ "9 Non Invasive Blood Pressure mean 2000030007 \n",
+ "10 0 0 \n",
+ "11 Temperature Celsius 2000030092 \n",
+ "12 Potassium (serum) 2000030062 \n",
+ "13 Sodium (serum) 2000030063 \n",
+ "14 Chloride (serum) 2000030064 \n",
+ "15 Creatinine (serum) 2000030068 \n",
+ "16 Hemoglobin|Blood|Blood Gas 2000001008 \n",
+ "17 HCO3 (serum) 2000030069 \n",
+ "18 Glucose (serum) 2000030072 \n",
+ "19 GCS - Eye Opening 2000030013 \n",
+ "\n",
+ " measurement_concept_id_2 count \n",
+ "0 3024171 19776 \n",
+ "1 3027018 19319 \n",
+ "2 40762499 17683 \n",
+ "3 3022318 12441 \n",
+ "4 3027598 9738 \n",
+ "5 3004249 9661 \n",
+ "6 3012888 9660 \n",
+ "7 21492240 8373 \n",
+ "8 21492239 8371 \n",
+ "9 21492241 8366 \n",
+ "10 0 6850 \n",
+ "11 3020891 4057 \n",
+ "12 3023103 3970 \n",
+ "13 3019550 3954 \n",
+ "14 3014576 3933 \n",
+ "15 3016723 3886 \n",
+ "16 3000963 3778 \n",
+ "17 3016293 3763 \n",
+ "18 3004501 3597 \n",
+ "19 3016335 3268 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "feature_counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "adata = omop.extract_features(adata, source='measurement', features=[2000030003, 'Respiratory Rate'], map_concept=True, add_aggregation_to_X=False, verbose=True, remove_empty_column=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 852 × 2\n",
+ " obs: 'preceding_visit_occurrence_id', 'race_source_concept_id', 'discharge_to_concept_id', 'discharge_to_source_value', 'death_datetime', 'visit_concept_id', 'race_source_value', 'gender_source_concept_id', 'cause_source_concept_id', 'cause_concept_id', 'death_type_concept_id', 'provider_id_y', 'care_site_id_y', 'provider_id_x', 'visit_end_date', 'month_of_birth', 'visit_type_concept_id', 'visit_source_concept_id', 'day_of_birth', 'cause_source_value', 'birth_datetime', 'visit_start_datetime', 'race_concept_id', 'care_site_id_x', 'visit_end_datetime', 'admitting_source_concept_id', 'location_id', 'ethnicity_source_value', 'visit_source_value', 'gender_concept_id', 'ethnicity_concept_id', 'visit_start_date', 'ethnicity_source_concept_id', 'person_source_value', 'death_date', 'admitting_source_value'\n",
+ " uns: 'numerical_columns', 'non_numerical_columns'\n",
+ " obsm: 'O2 saturation pulseoxymetry'\n",
+ " layers: 'original'"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "adata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Pain Level Response , feature ID 3034263 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mPain Level Response\u001b[0m, feature ID \u001b[1;32m3034263\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Pain Level Acceptable , feature ID 4138763 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mPain Level Acceptable\u001b[0m, feature ID \u001b[1;32m4138763\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Pain Present , feature ID 40757693 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mPain Present\u001b[0m, feature ID \u001b[1;32m40757693\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Temperature Site , feature ID 3024265 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mTemperature Site\u001b[0m, feature ID \u001b[1;32m3024265\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Oral Care , feature ID 3039006 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mOral Care\u001b[0m, feature ID \u001b[1;32m3039006\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Bowel Sounds , feature ID 4337265 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mBowel Sounds\u001b[0m, feature ID \u001b[1;32m4337265\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Past medical history , feature ID 3001062 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mPast medical history\u001b[0m, feature ID \u001b[1;32m3001062\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Respiratory Effort , feature ID 21492835 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mRespiratory Effort\u001b[0m, feature ID \u001b[1;32m21492835\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Code Status , feature ID 4127294 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mCode Status\u001b[0m, feature ID \u001b[1;32m4127294\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Pain Management , feature ID 4192791 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mPain Management\u001b[0m, feature ID \u001b[1;32m4192791\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature O2 Delivery Device( s ) , feature ID 4036936 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mO2 Delivery \u001b[0m\u001b[1;32mDevice\u001b[0m\u001b[1;32m(\u001b[0m\u001b[32ms\u001b[0m\u001b[1;32m)\u001b[0m, feature ID \u001b[1;32m4036936\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Fall , feature ID 436583 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mFall\u001b[0m, feature ID \u001b[1;32m436583\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Ventilator Type , feature ID 3007397 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mVentilator Type\u001b[0m, feature ID \u001b[1;32m3007397\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Family updated by MD , feature ID 46273928 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mFamily updated by MD\u001b[0m, feature ID \u001b[1;32m46273928\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature SVV ( Arterial ) , feature ID 37116693 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mSVV \u001b[0m\u001b[1;32m(\u001b[0m\u001b[32mArterial\u001b[0m\u001b[1;32m)\u001b[0m, feature ID \u001b[1;32m37116693\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Tobacco Use History , feature ID 3012697 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mTobacco Use History\u001b[0m, feature ID \u001b[1;32m3012697\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Family meeting attempted, unable , feature ID 46272666 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mFamily meeting attempted, unable\u001b[0m, feature ID \u001b[1;32m46272666\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature OR Sent , feature ID 4162219 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mOR Sent\u001b[0m, feature ID \u001b[1;32m4162219\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature MRSA SCREEN , feature ID 37397888 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mMRSA SCREEN\u001b[0m, feature ID \u001b[1;32m37397888\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature OR Received , feature ID 4265599 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mOR Received\u001b[0m, feature ID \u001b[1;32m4265599\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature ICP Line Insertion Date , feature ID 4048955 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mICP Line Insertion Date\u001b[0m, feature ID \u001b[1;32m4048955\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Couldn't find concept [ 4215685 , 440922 , 44784283 , 4296248 , 2617452 , 4214956 , 4251171 , 40766231 , 40758030 , 46235654 ,\n",
+ "435928 , 443345 , 4179084 , 4310926 , 4306655 , 4207283 , 46273937 , 4119499 , 4323360 , 4311079 , 4239756 , 4203722 , 433942 , \n",
+ "4256640 , 440279 , 4324190 , 4179963 , 40479553 , 46272451 , 440927 , 4060985 , 439221 , 4314870 , 432789 , 4233464 , 433930 , \n",
+ "46272450 , 45766277 , 4276526 , 4135421 , 437165 , 4167217 , 441764 , 4180749 , 46272734 , 37016200 , 4060705 , 4168834 , \n",
+ "4182335 , 4179242 , 441749 , 4019967 , 4143274 , 4178782 , 4014023 , 4324321 , 4323345 , 439228 , 441192 , 439222 , 439996 , \n",
+ "46274064 , 4059907 , 36717001 , 4095498 , 43021271 , 4005823 , 46270032 , 4148407 , 46274011 , 4315085 , 441751 , 436882 , \n",
+ "441744 , 4279614 , 4144274 , 438046 , 4322482 , 44782983 , 443367 , 4162908 , 438950 , 4022569 , 42539646 , 4140830 , 4324181 , \n",
+ "441207 , 43020581 , 4139934 , 4168192 , 40481022 , 433658 , 443370 , 4096435 , 4152408 , 4147565 , 4060084 , 4305831 , \n",
+ "43021273 , 4059910 , 437175 , 36716245 , 436315 , 439371 , 4310065 , 42709971 , 4058728 , 4311077 , 4190634 , 4305297 , \n",
+ "4325851 , 437483 , 4216727 , 4334494 , 441206 , 40481819 , 4312972 , 443340 , 4327107 , 4144292 , 4310928 , 442165 , 4330220 , \n",
+ "443548 , 4179214 , 37016166 , 437738 , 4323344 , 4175576 , 436292 ] in concept table!\n",
+ " \n"
+ ],
+ "text/plain": [
+ "Couldn't find concept \u001b[1m[\u001b[0m\u001b[1;36m4215685\u001b[0m, \u001b[1;36m440922\u001b[0m, \u001b[1;36m44784283\u001b[0m, \u001b[1;36m4296248\u001b[0m, \u001b[1;36m2617452\u001b[0m, \u001b[1;36m4214956\u001b[0m, \u001b[1;36m4251171\u001b[0m, \u001b[1;36m40766231\u001b[0m, \u001b[1;36m40758030\u001b[0m, \u001b[1;36m46235654\u001b[0m,\n",
+ "\u001b[1;36m435928\u001b[0m, \u001b[1;36m443345\u001b[0m, \u001b[1;36m4179084\u001b[0m, \u001b[1;36m4310926\u001b[0m, \u001b[1;36m4306655\u001b[0m, \u001b[1;36m4207283\u001b[0m, \u001b[1;36m46273937\u001b[0m, \u001b[1;36m4119499\u001b[0m, \u001b[1;36m4323360\u001b[0m, \u001b[1;36m4311079\u001b[0m, \u001b[1;36m4239756\u001b[0m, \u001b[1;36m4203722\u001b[0m, \u001b[1;36m433942\u001b[0m, \n",
+ "\u001b[1;36m4256640\u001b[0m, \u001b[1;36m440279\u001b[0m, \u001b[1;36m4324190\u001b[0m, \u001b[1;36m4179963\u001b[0m, \u001b[1;36m40479553\u001b[0m, \u001b[1;36m46272451\u001b[0m, \u001b[1;36m440927\u001b[0m, \u001b[1;36m4060985\u001b[0m, \u001b[1;36m439221\u001b[0m, \u001b[1;36m4314870\u001b[0m, \u001b[1;36m432789\u001b[0m, \u001b[1;36m4233464\u001b[0m, \u001b[1;36m433930\u001b[0m, \n",
+ "\u001b[1;36m46272450\u001b[0m, \u001b[1;36m45766277\u001b[0m, \u001b[1;36m4276526\u001b[0m, \u001b[1;36m4135421\u001b[0m, \u001b[1;36m437165\u001b[0m, \u001b[1;36m4167217\u001b[0m, \u001b[1;36m441764\u001b[0m, \u001b[1;36m4180749\u001b[0m, \u001b[1;36m46272734\u001b[0m, \u001b[1;36m37016200\u001b[0m, \u001b[1;36m4060705\u001b[0m, \u001b[1;36m4168834\u001b[0m, \n",
+ "\u001b[1;36m4182335\u001b[0m, \u001b[1;36m4179242\u001b[0m, \u001b[1;36m441749\u001b[0m, \u001b[1;36m4019967\u001b[0m, \u001b[1;36m4143274\u001b[0m, \u001b[1;36m4178782\u001b[0m, \u001b[1;36m4014023\u001b[0m, \u001b[1;36m4324321\u001b[0m, \u001b[1;36m4323345\u001b[0m, \u001b[1;36m439228\u001b[0m, \u001b[1;36m441192\u001b[0m, \u001b[1;36m439222\u001b[0m, \u001b[1;36m439996\u001b[0m, \n",
+ "\u001b[1;36m46274064\u001b[0m, \u001b[1;36m4059907\u001b[0m, \u001b[1;36m36717001\u001b[0m, \u001b[1;36m4095498\u001b[0m, \u001b[1;36m43021271\u001b[0m, \u001b[1;36m4005823\u001b[0m, \u001b[1;36m46270032\u001b[0m, \u001b[1;36m4148407\u001b[0m, \u001b[1;36m46274011\u001b[0m, \u001b[1;36m4315085\u001b[0m, \u001b[1;36m441751\u001b[0m, \u001b[1;36m436882\u001b[0m, \n",
+ "\u001b[1;36m441744\u001b[0m, \u001b[1;36m4279614\u001b[0m, \u001b[1;36m4144274\u001b[0m, \u001b[1;36m438046\u001b[0m, \u001b[1;36m4322482\u001b[0m, \u001b[1;36m44782983\u001b[0m, \u001b[1;36m443367\u001b[0m, \u001b[1;36m4162908\u001b[0m, \u001b[1;36m438950\u001b[0m, \u001b[1;36m4022569\u001b[0m, \u001b[1;36m42539646\u001b[0m, \u001b[1;36m4140830\u001b[0m, \u001b[1;36m4324181\u001b[0m, \n",
+ "\u001b[1;36m441207\u001b[0m, \u001b[1;36m43020581\u001b[0m, \u001b[1;36m4139934\u001b[0m, \u001b[1;36m4168192\u001b[0m, \u001b[1;36m40481022\u001b[0m, \u001b[1;36m433658\u001b[0m, \u001b[1;36m443370\u001b[0m, \u001b[1;36m4096435\u001b[0m, \u001b[1;36m4152408\u001b[0m, \u001b[1;36m4147565\u001b[0m, \u001b[1;36m4060084\u001b[0m, \u001b[1;36m4305831\u001b[0m, \n",
+ "\u001b[1;36m43021273\u001b[0m, \u001b[1;36m4059910\u001b[0m, \u001b[1;36m437175\u001b[0m, \u001b[1;36m36716245\u001b[0m, \u001b[1;36m436315\u001b[0m, \u001b[1;36m439371\u001b[0m, \u001b[1;36m4310065\u001b[0m, \u001b[1;36m42709971\u001b[0m, \u001b[1;36m4058728\u001b[0m, \u001b[1;36m4311077\u001b[0m, \u001b[1;36m4190634\u001b[0m, \u001b[1;36m4305297\u001b[0m, \n",
+ "\u001b[1;36m4325851\u001b[0m, \u001b[1;36m437483\u001b[0m, \u001b[1;36m4216727\u001b[0m, \u001b[1;36m4334494\u001b[0m, \u001b[1;36m441206\u001b[0m, \u001b[1;36m40481819\u001b[0m, \u001b[1;36m4312972\u001b[0m, \u001b[1;36m443340\u001b[0m, \u001b[1;36m4327107\u001b[0m, \u001b[1;36m4144292\u001b[0m, \u001b[1;36m4310928\u001b[0m, \u001b[1;36m442165\u001b[0m, \u001b[1;36m4330220\u001b[0m, \n",
+ "\u001b[1;36m443548\u001b[0m, \u001b[1;36m4179214\u001b[0m, \u001b[1;36m37016166\u001b[0m, \u001b[1;36m437738\u001b[0m, \u001b[1;36m4323344\u001b[0m, \u001b[1;36m4175576\u001b[0m, \u001b[1;36m436292\u001b[0m\u001b[1m]\u001b[0m in concept table!\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "adata = omop.extract_features(adata, source='observation', map_concept=True, add_aggregation_to_X=False, verbose=True, remove_empty_column=True, columns_in_source_table = ['observation_id', 'observation_datetime', 'observation_type_concept_id', 'value_as_number'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 852 × 2\n",
+ " obs: 'discharge_to_concept_id', 'death_date', 'ethnicity_source_concept_id', 'visit_start_date', 'gender_source_concept_id', 'discharge_to_source_value', 'provider_id_y', 'visit_source_concept_id', 'death_type_concept_id', 'cause_source_value', 'care_site_id_x', 'preceding_visit_occurrence_id', 'race_source_value', 'admitting_source_value', 'cause_concept_id', 'visit_source_value', 'gender_concept_id', 'race_concept_id', 'care_site_id_y', 'admitting_source_concept_id', 'birth_datetime', 'location_id', 'race_source_concept_id', 'visit_type_concept_id', 'provider_id_x', 'visit_start_datetime', 'visit_concept_id', 'cause_source_concept_id', 'visit_end_datetime', 'day_of_birth', 'visit_end_date', 'death_datetime', 'ethnicity_source_value', 'ethnicity_concept_id', 'person_source_value', 'month_of_birth'\n",
+ " uns: 'numerical_columns', 'non_numerical_columns'\n",
+ " layers: 'original'"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "adata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "discharge_to_concept_id Int64\n",
+ "death_date category\n",
+ "ethnicity_source_concept_id Int64\n",
+ "visit_start_date category\n",
+ "gender_source_concept_id Int64\n",
+ "discharge_to_source_value category\n",
+ "provider_id_y Int64\n",
+ "visit_source_concept_id Int64\n",
+ "death_type_concept_id Int64\n",
+ "cause_source_value category\n",
+ "care_site_id_x Int64\n",
+ "preceding_visit_occurrence_id Int64\n",
+ "race_source_value category\n",
+ "admitting_source_value category\n",
+ "cause_concept_id Int64\n",
+ "visit_source_value category\n",
+ "gender_concept_id Int64\n",
+ "race_concept_id Int64\n",
+ "care_site_id_y Int64\n",
+ "admitting_source_concept_id Int64\n",
+ "birth_datetime category\n",
+ "location_id Int64\n",
+ "race_source_concept_id Int64\n",
+ "visit_type_concept_id Int64\n",
+ "provider_id_x Int64\n",
+ "visit_start_datetime category\n",
+ "visit_concept_id Int64\n",
+ "cause_source_concept_id Int64\n",
+ "visit_end_datetime category\n",
+ "day_of_birth Int64\n",
+ "visit_end_date category\n",
+ "death_datetime category\n",
+ "ethnicity_source_value category\n",
+ "ethnicity_concept_id Int64\n",
+ "person_source_value category\n",
+ "month_of_birth Int64\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "adata.obs.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "adata1 = adata.copy()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Trying to extarct the following features: [ 40762499 ] \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Trying to extarct the following features: \u001b[1m[\u001b[0m\u001b[1;36m40762499\u001b[0m\u001b[1m]\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature O2 saturation pulseoxymetry , feature ID 40762499 in concept table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mO2 saturation pulseoxymetry\u001b[0m, feature ID \u001b[1;32m40762499\u001b[0m in concept table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/opt/anaconda3/envs/ehrapy_latents/lib/python3.9/site-packages/anndata/utils.py:334: ExperimentalFeatureWarning: Support for Awkward Arrays is currently experimental. Behavior may change in the future. Please report any issues you may encounter!\n",
+ " warnings.warn(msg, category, stacklevel=stacklevel)\n"
+ ]
+ }
+ ],
+ "source": [
+ "adata = omop.extract_features(adata, source='measurement', features=[40762499], map_concept=True, add_aggregation_to_X=False, verbose=True, remove_empty_column=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "adata = omop.extract_features(adata, source='observation', features=[2000030003], map_concept=True, add_aggregation_to_X=False, verbose=True, remove_empty_column=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Feature Extraction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature O2 saturation pulseoxymetry , feature ID 2000030003 in concept table, feature ID 40762499 in \n",
+ "concept relationship table, match socre = 1 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mO2 saturation pulseoxymetry\u001b[0m, feature ID \u001b[1;32m2000030003\u001b[0m in concept table, feature ID \u001b[1;32m40762499\u001b[0m in \n",
+ "concept relationship table, match socre = \u001b[1;32m1\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "extracting features\n",
+ "\u001b[1;35m2024-01-02 17:00:32,078\u001b[0m - \u001b[1;34mroot\u001b[0m \u001b[1;30mINFO - Added `['O2 saturation pulseoxymetry_min', 'O2 saturation pulseoxymetry_max', 'O2 saturation pulseoxymetry_mean']` columns to `obs`.\u001b[0m\n",
+ "\u001b[1;35m2024-01-02 17:00:32,080\u001b[0m - \u001b[1;34mroot\u001b[0m \u001b[1;30mINFO - Added `['O2 saturation pulseoxymetry_min', 'O2 saturation pulseoxymetry_max', 'O2 saturation pulseoxymetry_mean']` features to `X`.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Extract features from tables\n",
+ "# Different sources are supported: measurement, observation\n",
+ "\n",
+ "# Could use feature ID \n",
+ "adata = omop.extract_features(adata, source='measurement', features=[2000030003], map_concept=True, add_aggregation_to_X=True, add_all_data=True, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "Detected: feature Base Excess|Blood|Blood Gas , feature ID 2000001002 in concept table, feature ID 3012501 in \n",
+ "concept relationship table, match socre = 1.0 . \n",
+ " \n"
+ ],
+ "text/plain": [
+ "Detected: feature \u001b[32mBase Excess|Blood|Blood Gas\u001b[0m, feature ID \u001b[1;32m2000001002\u001b[0m in concept table, feature ID \u001b[1;32m3012501\u001b[0m in \n",
+ "concept relationship table, match socre = \u001b[1;32m1.0\u001b[0m\u001b[32m.\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "extracting features\n",
+ "\u001b[1;35m2024-01-02 17:00:35,969\u001b[0m - \u001b[1;34mroot\u001b[0m \u001b[1;30mINFO - Added `['Base Excess|Blood|Blood Gas_min', 'Base Excess|Blood|Blood Gas_max', 'Base Excess|Blood|Blood Gas_mean']` columns to `obs`.\u001b[0m\n",
+ "\u001b[1;35m2024-01-02 17:00:35,970\u001b[0m - \u001b[1;34mroot\u001b[0m \u001b[1;30mINFO - Added `['Base Excess|Blood|Blood Gas_min', 'Base Excess|Blood|Blood Gas_max', 'Base Excess|Blood|Blood Gas_mean']` features to `X`.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Could use feature name\n",
+ "adata = omop.extract_features(adata, source='measurement', features=[\"Base Excess|Blood|Blood Gas\"], map_concept=True, add_aggregation_to_X=True, add_all_data=True, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Unit \n",
+ " domain_id \n",
+ " concept_class_id \n",
+ " concept_code \n",
+ " \n",
+ " \n",
+ " \n",
+ " \n",
+ " year_of_birth \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " gender_source_value \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " O2 saturation pulseoxymetry_min \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " O2 saturation pulseoxymetry_max \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " O2 saturation pulseoxymetry_mean \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " NaN \n",
+ " \n",
+ " \n",
+ " Base Excess|Blood|Blood Gas_min \n",
+ " mEq/L \n",
+ " Measurement \n",
+ " Lab Test \n",
+ " 50802 \n",
+ " \n",
+ " \n",
+ " Base Excess|Blood|Blood Gas_max \n",
+ " mEq/L \n",
+ " Measurement \n",
+ " Lab Test \n",
+ " 50802 \n",
+ " \n",
+ " \n",
+ " Base Excess|Blood|Blood Gas_mean \n",
+ " mEq/L \n",
+ " Measurement \n",
+ " Lab Test \n",
+ " 50802 \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Unit domain_id concept_class_id \\\n",
+ "year_of_birth NaN NaN NaN \n",
+ "gender_source_value NaN NaN NaN \n",
+ "O2 saturation pulseoxymetry_min NaN NaN NaN \n",
+ "O2 saturation pulseoxymetry_max NaN NaN NaN \n",
+ "O2 saturation pulseoxymetry_mean NaN NaN NaN \n",
+ "Base Excess|Blood|Blood Gas_min mEq/L Measurement Lab Test \n",
+ "Base Excess|Blood|Blood Gas_max mEq/L Measurement Lab Test \n",
+ "Base Excess|Blood|Blood Gas_mean mEq/L Measurement Lab Test \n",
+ "\n",
+ " concept_code \n",
+ "year_of_birth NaN \n",
+ "gender_source_value NaN \n",
+ "O2 saturation pulseoxymetry_min NaN \n",
+ "O2 saturation pulseoxymetry_max NaN \n",
+ "O2 saturation pulseoxymetry_mean NaN \n",
+ "Base Excess|Blood|Blood Gas_min 50802 \n",
+ "Base Excess|Blood|Blood Gas_max 50802 \n",
+ "Base Excess|Blood|Blood Gas_mean 50802 "
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "adata.var"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "AnnData object with n_obs × n_vars = 852 × 8\n",
+ " obs: 'death_datetime', 'visit_end_datetime', 'visit_start_datetime'\n",
+ " var: 'Unit', 'domain_id', 'concept_class_id', 'concept_code'\n",
+ " uns: 'numerical_columns', 'non_numerical_columns'\n",
+ " obsm: 'O2 saturation pulseoxymetry', 'Base Excess|Blood|Blood Gas', 'note'"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "adata"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ehrapy_latents",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/ehrdata.py b/ehrdata.py
index d51d8f3..534839c 100644
--- a/ehrdata.py
+++ b/ehrdata.py
@@ -1,72 +1,100 @@
import awkward as ak
import numpy as np
import pandas as pd
-
-
+import csv
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
import ehrapy as ep
import scanpy as sc
from anndata import AnnData
import mudata as md
from mudata import MuData
-from typing import List, Union
+from typing import List, Union, Literal
import os
import glob
import dask.dataframe as dd
from thefuzz import process
import sys
from rich import print as rprint
+import missingno as msno
+import warnings
+import numbers
+
clinical_tables_columns = {
- 'person': ['person_id', 'year_of_birth', 'gender_source_value'],
- 'observation_period': [],
- 'death': ['person_id', 'death_datetime'],
- 'visit_occurrence': ['visit_occurrence_id', 'person_id', 'visit_start_datetime', 'visit_end_datetime'],
- 'visit_detail': [],
- 'condition_occurrence': [],
- 'drug_exposure': ['drug_exposure_id', 'person_id', 'visit_occurrence_id', 'drug_concept_id', ],
- 'procedure_occurrence': ['visit_occurrence_id', 'person_id', 'visit_start_datetime', 'visit_end_datetime'],
- 'device_exposure': [],
- 'specimen': [],
- 'measurement': ['measurement_id', 'person_id', 'visit_occurrence_id', 'measurement_concept_id', 'measurement_datetime', 'value_as_number', 'unit_source_value'],
- 'observation': ['observation_id', 'person_id', 'observation_concept_id', 'observation_datetime', "value_as_number", "value_as_string"],
- 'note': [],
- 'note_nlp': [],
- 'fact_relationship': [],
- 'procedure_occurrence': [],
+ "person": ["person_id", "year_of_birth", "gender_source_value"],
+ "observation_period": [],
+ "death": ["person_id", "death_datetime"],
+ "visit_occurrence": ["visit_occurrence_id", "person_id", "visit_start_datetime", "visit_end_datetime"],
+ "visit_detail": [],
+ "condition_occurrence": [],
+ "drug_exposure": [
+ "drug_exposure_id",
+ "person_id",
+ "visit_occurrence_id",
+ "drug_concept_id",
+ ],
+ "procedure_occurrence": ["visit_occurrence_id", "person_id", "visit_start_datetime", "visit_end_datetime"],
+ "device_exposure": [],
+ "specimen": [],
+ "measurement": [
+ "measurement_id",
+ "person_id",
+ "visit_occurrence_id",
+ "measurement_concept_id",
+ "measurement_datetime",
+ "value_as_number",
+ "unit_source_value",
+ ],
+ "observation": [
+ "observation_id",
+ "person_id",
+ "observation_concept_id",
+ "observation_datetime",
+ "value_as_number",
+ "value_as_string",
+ ],
+ "note": [],
+ "note_nlp": [],
+ "fact_relationship": [],
+ "procedure_occurrence": [],
}
health_system_tables_columns = {
- 'location': [],
- 'care_site': ['care_site_id', 'care_site_name'],
- 'provider': [],
+ "location": [],
+ "care_site": ["care_site_id", "care_site_name"],
+ "provider": [],
}
-vocabularies_tables_columns ={
- 'concept': ['concept_id', 'concept_name', 'domain_id', 'vocabulary_id', 'concept_class_id', 'standard_concept', 'concept_code'],
- 'vocabulary': [],
- 'domain': [],
- 'concept_class': [],
- 'concept_synonym': [],
- 'concept_relationship': ["concept_id_1", "concept_id_2", "relationship_id"],
- 'relationship': [],
- 'concept_ancestor': [],
- 'source_to_concept_map': [],
- 'drug_strength': []
+vocabularies_tables_columns = {
+ "concept": [
+ "concept_id",
+ "concept_name",
+ "domain_id",
+ "vocabulary_id",
+ "concept_class_id",
+ "standard_concept",
+ "concept_code",
+ ],
+ "vocabulary": [],
+ "domain": [],
+ "concept_class": [],
+ "concept_synonym": [],
+ "concept_relationship": ["concept_id_1", "concept_id_2", "relationship_id"],
+ "relationship": [],
+ "concept_ancestor": [],
+ "source_to_concept_map": [],
+ "drug_strength": [],
}
-dtypes_dict = {}
-dtypes_dict['concept'] = {'concept_id': int, 'standard_concept': str}
-dtypes_dict['measurement']={'measurement_source_concept_id': int,
- 'measurement_source_value': str,
- 'unit_concept_id': int,
- 'value_as_number': 'float64',
- 'value_source_value': str}
from difflib import SequenceMatcher
from heapq import nlargest as _nlargest
+
def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6):
- """Use SequenceMatcher to return a list of the indexes of the best
- "good enough" matches. word is a sequence for which close matches
+ """Use SequenceMatcher to return a list of the indexes of the best
+ "good enough" matches. word is a sequence for which close matches
are desired (typically a string).
possibilities is a dictionary of sequences.
Optional arg n (default 2) is the maximum number of close matches to
@@ -75,66 +103,139 @@ def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6):
that don't score at least that similar to word are ignored.
"""
- if not n > 0:
+ if not n > 0:
raise ValueError("n must be > 0: %r" % (n,))
if not 0.0 <= cutoff <= 1.0:
raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,))
result = []
s = SequenceMatcher()
s.set_seq2(word)
- for _, (key, x) in enumerate(possibilities.items()):
- s.set_seq1(x)
- if s.real_quick_ratio() >= cutoff and \
- s.quick_ratio() >= cutoff and \
- s.ratio() >= cutoff:
- result.append((s.ratio(), x, key))
+ for _, (key, value) in enumerate(possibilities.items()):
+ s.set_seq1(value)
+ if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff:
+ result.append((s.ratio(), value, key))
# Move the best scorers to head of list
result = _nlargest(n, result)
-
+
# Strip scores for the best n matches
- return [(x, score, key) for score, x, key in result]
+ return [(value, key, score) for score, value, key in result]
+
def df_to_dict(df, key, value):
if isinstance(df, dd.DataFrame):
return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict()
else:
return pd.Series(df[value].values, index=df[key]).to_dict()
-
-class OMOP():
-
+
+def get_column_types(csv_path=None, columns=None):
+ column_types = {}
+ parse_dates = []
+ if csv_path:
+ with open(csv_path, "r") as f:
+ dict_reader = csv.DictReader(f)
+ columns = dict_reader.fieldnames
+ columns_lowercase = [column.lower() for column in columns]
+ for i, column in enumerate(columns_lowercase):
+ if column.endswith(
+ (
+ "source_value",
+ "reason",
+ "measurement_time",
+ "as_string",
+ "title",
+ "text",
+ "name",
+ "concept",
+ "code",
+ "domain_id",
+ "vocabulary_id",
+ "concept_class_id",
+ "relationship_id",
+ "specimen_source_id",
+ "production_id",
+ "unique_device_id",
+ "sig",
+ "lot_number",
+ )
+ ):
+ column_types[columns[i]] = str
+ # TODO: the quantity column has a different type in different tables
+ elif column.endswith(("as_number", "low", "high", "quantity")):
+ column_types[columns[i]] = float
+ elif column.endswith("date"):
+ parse_dates.append(columns[i])
+ elif column.endswith("datetime"):
+ parse_dates.append(columns[i])
+ elif column.endswith(("id", "birth", "id_1", "id_2", "refills", "days_supply")):
+ column_types[columns[i]] = "Int64"
+ else:
+ raise KeyError(f"{columns[i]} is not defined in OMOP CDM")
+ if len(parse_dates) == 0:
+ parse_dates = None
+ return column_types, parse_dates
+
+
+class OMOP:
def __init__(self, file_paths):
self.base = file_paths
- file_list = glob.glob(os.path.join(file_paths, '*'))
+ file_list = glob.glob(os.path.join(file_paths, "*"))
self.loaded_tabel = None
self.filepath = {}
for file_path in file_list:
- file_name = file_path.split('/')[-1].removesuffix('.csv')
+ file_name = file_path.split("/")[-1].removesuffix(".csv")
self.filepath[file_name] = file_path
-
+
self.tables = list(self.filepath.keys())
- '''
+ """
if "concept" in self.tables:
df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"])
self.concept_id_to_name = dict(zip(df_concept['id'], df_concept['name']))
self.concept_name_to_id = dict(zip(df_concept['name'], df_concept['id']))
- '''
+ """
@property
def clinical_tables(self):
"""
A dictionary containing all of the ``Clinical`` OMOP CDM tables in the connected database.
"""
- table_names = ['person','observation_period','specimen','death','visit_occurrence','visit_detail','procedure_occurrence','drug_exposure','device_exposure','condition_occurrence','measurement','note','note_nlp','observation','fact_relationship']
+ table_names = [
+ "person",
+ "observation_period",
+ "specimen",
+ "death",
+ "visit_occurrence",
+ "visit_detail",
+ "procedure_occurrence",
+ "drug_exposure",
+ "device_exposure",
+ "condition_occurrence",
+ "measurement",
+ "note",
+ "note_nlp",
+ "observation",
+ "fact_relationship",
+ ]
return [table_name for table_name in self.tables if table_name in table_names]
-
+
@property
def vocabularies_tables(self):
"""
A dictionary containing all of the ``Vocabularies`` OMOP CDM tables in the connected database.
"""
- table_names = ['concept','vocabulary','domain','concept_class','concept_relationship','relationship','concept_synonym','concept_ancestor','source_to_concept_map','drug_strength']
+ table_names = [
+ "concept",
+ "vocabulary",
+ "domain",
+ "concept_class",
+ "concept_relationship",
+ "relationship",
+ "concept_synonym",
+ "concept_ancestor",
+ "source_to_concept_map",
+ "drug_strength",
+ ]
return [table_name for table_name in self.tables if table_name in table_names]
@property
@@ -142,7 +243,7 @@ def metadata_tables(self):
"""
A dictionary containing all of the ``MetaData`` OMOP CDM tables in the connected database.
"""
- table_names = ['cdm_source','metadata']
+ table_names = ["cdm_source", "metadata"]
return [table_name for table_name in self.tables if table_name in table_names]
@property
@@ -150,7 +251,7 @@ def health_system_tables(self):
"""
A dictionary containing all of the ``Health System`` OMOP CDM tables in the connected database.
"""
- table_names = ['location','care_site','provider']
+ table_names = ["location", "care_site", "provider"]
return [table_name for table_name in self.tables if table_name in table_names]
@property
@@ -158,55 +259,52 @@ def derived_elements_tables(self):
"""
A dictionary containing all of the ``Derived Elements`` OMOP CDM tables in the connected database.
"""
- table_names = ['cohort','cohort_definition','drug_era','dose_era','condition_era']
+ table_names = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"]
return [table_name for table_name in self.tables if table_name in table_names]
-
+
@property
def health_economics_tables(self):
"""
A dictionary containing all of the ``Health Economics`` OMOP CDM tables in the connected database.
"""
- table_names = ['payer_plan_period','cost']
+ table_names = ["payer_plan_period", "cost"]
return [table_name for table_name in self.tables if table_name in table_names]
-
-
-
- def load(self, level='stay_level', tables = ['visit_occurrence', 'person', 'death'], add_to_X=None, features=None):
-
- if level=='stay_level':
- index = {'visit_occurrence': 'visit_occurrence_id', 'person': 'person_id', 'death': "person_id"}
+ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death"]):
+ # TODO patient level and hospital level
+ if level == "stay_level":
+ index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"}
# TODO Only support clinical_tables_columns
- for table in tables:
- setattr(self, table, dd.read_csv(self.filepath[table], usecols=clinical_tables_columns[table]).set_index("person_id"))
-
-
- #concept_id_list = list(self.concept.concept_id)
- #concept_name_list = list(self.concept.concept_id)
- #concept_domain_id_list = list(set(self.concept.domain_id))
-
-
-
-
-
- #self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure']
- joined_table = dd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how='left')
- joined_table = dd.merge(joined_table, self.death, left_index=True, right_index=True, how='left')
+ for table in tables:
+ column_types, parse_dates = get_column_types(self.filepath[table])
+ setattr(
+ self,
+ table,
+ dd.read_csv(self.filepath[table], dtype=column_types, parse_dates=parse_dates).set_index(
+ "person_id"
+ ),
+ )
+
+ # concept_id_list = list(self.concept.concept_id)
+ # concept_name_list = list(self.concept.concept_id)
+ # concept_domain_id_list = list(set(self.concept.domain_id))
+
+ # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure']
+ joined_table = dd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left")
+ joined_table = dd.merge(joined_table, self.death, left_index=True, right_index=True, how="left")
joined_table = joined_table.compute()
- joined_table = joined_table.set_index('visit_occurrence_id')
-
-
-
+ joined_table = joined_table.set_index("visit_occurrence_id")
- #obs_only_list = list(self.joined_table.columns)
- #obs_only_list.remove('visit_occurrence_id')
- columns_obs_only = list(set(joined_table.columns) - set(['year_of_birth', 'gender_source_value']))
+ # obs_only_list = list(self.joined_table.columns)
+ # obs_only_list.remove('visit_occurrence_id')
+ columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"]))
adata = ep.ad.df_to_anndata(
- joined_table, index_column="visit_occurrence_id", columns_obs_only = columns_obs_only)
-
- '''
+ joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only
+ )
+
+ """
for column in self.measurement.columns:
if column != 'visit_occurrence_id':
obs_list = []
@@ -227,163 +325,377 @@ def load(self, level='stay_level', tables = ['visit_occurrence', 'person', 'deat
for visit_occurrence_id in adata.obs.index:
obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column]))
adata.obsm[column]= ak.Array(obs_list)
- '''
-
+ """
+
+ return adata
+
+ def feature_statistics(
+ self,
+ source: Literal[
+ "observation",
+ "measurement",
+ "procedure_occurrence",
+ "specimen",
+ "device_exposure",
+ "drug_exposure",
+ "condition_occurrence",
+ ],
+ map_concept=True,
+ number=20,
+ ):
+ column_types, parse_dates = get_column_types(self.filepath[source])
+ df_source = dd.read_csv(
+ self.filepath[source], dtype=column_types, parse_dates=parse_dates
+ ) # , usecols=clinical_tables_columns[source])
+ feature_counts = df_source.compute().value_counts(f"{source}_concept_id")[0:number]
+ feature_counts = feature_counts.to_frame().reset_index(drop=False)
+
+ feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = self.map_concept_id(
+ feature_counts[f"{source}_concept_id"], verbose=False
+ )
+ feature_counts["feature_name"] = self.get_concept_name(feature_counts[f"{source}_concept_id_1"])
+ if feature_counts[f"{source}_concept_id_1"].equals(feature_counts[f"{source}_concept_id_2"]):
+ feature_counts.drop(f"{source}_concept_id_2", inplace=True)
+ feature_counts.rename(columns={f"{source}_concept_id_1": f"{source}_concept_id"})
+ feature_counts = feature_counts.reindex(columns=["feature_name", f"{source}_concept_id", "count"])
+ else:
+ feature_counts = feature_counts.reindex(
+ columns=["feature_name", f"{source}_concept_id_1", f"{source}_concept_id_2", "count"]
+ )
+
+ ax = sns.barplot(feature_counts, x="feature_name", y="count")
+ ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
+ plt.tight_layout()
+ return feature_counts
+
+ def map_concept_id(self, concept_id: Union[str, List], verbose=True):
+ column_types, parse_dates = get_column_types(self.filepath["concept_relationship"])
+ df_concept_relationship = dd.read_csv(
+ self.filepath["concept_relationship"], dtype=column_types, parse_dates=parse_dates
+ ).dropna(
+ subset=["concept_id_1", "concept_id_2", "relationship_id"]
+ ) # , usecols=vocabularies_tables_columns["concept_relationship"],
+ concept_relationship_dict = df_to_dict(
+ df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"],
+ key="concept_id_1",
+ value="concept_id_2",
+ )
+ concept_relationship_dict_reverse = df_to_dict(
+ df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"],
+ key="concept_id_1",
+ value="concept_id_2",
+ )
+
+ if isinstance(concept_id, numbers.Integral):
+ concept_id = [concept_id]
+
+ concept_id_1 = []
+ concept_id_2 = []
+ concept_id_mapped_not_found = []
+ for id in concept_id:
+ try:
+ concept_id_2.append(concept_relationship_dict[id])
+ concept_id_1.append(id)
+ except KeyError:
+ try:
+ concept_id_1.append(concept_relationship_dict_reverse[id])
+ concept_id_2.append(id)
+ except KeyError:
+ concept_id_1.append(id)
+ concept_id_2.append(id)
+ concept_id_mapped_not_found.append(id)
+ if len(concept_id_mapped_not_found) > 0:
+ # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!")
+ if verbose:
+ rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!")
+ if len(concept_id_1) == 1:
+ return concept_id_1[0], concept_id_2[0]
+ else:
+ return concept_id_1, concept_id_2
+
+ def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verbose=True):
+ if isinstance(concept_id, numbers.Integral):
+ concept_id = [concept_id]
+
+ column_types, parse_dates = get_column_types(self.filepath["concept"])
+ df_concept = dd.read_csv(self.filepath["concept"], dtype=column_types, parse_dates=parse_dates).dropna(
+ subset=["concept_id", "concept_name"]
+ ) # usecols=vocabularies_tables_columns["concept"]
+ concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name")
+ concept_name = []
+ concept_name_not_found = []
+ for id in concept_id:
+ try:
+ concept_name.append(concept_dict[id])
+ except KeyError:
+ concept_name.append(id)
+ concept_name_not_found.append(id)
+ if len(concept_name_not_found) > 0:
+ # warnings.warn(f"Couldn't find concept {id} in concept table!")
+ if verbose:
+ rprint(f"Couldn't find concept {concept_name_not_found} in concept table!")
+ if raise_error:
+ raise KeyError
+ if len(concept_name) == 1:
+ return concept_name[0]
+ else:
+ return concept_name
+
+ def extract_note(self, adata, source="note"):
+ column_types, parse_dates = get_column_types(self.filepath[source])
+ df_source = dd.read_csv(self.filepath[source], dtype=column_types, parse_dates=parse_dates)
+ if columns is None:
+ columns = df_source.columns
+ obs_dict = [
+ {
+ column: list(df_source[df_source["visit_occurrence_id"] == int(visit_occurrence_id)][column])
+ for column in columns
+ }
+ for visit_occurrence_id in adata.obs.index
+ ]
+ adata.obsm["note"] = ak.Array(obs_dict)
return adata
- def extract_features(self,
- adata,
- source: str,
- features: str or int or List[Union[str, int]],
- map_concept = True,
- add_aggregation_to_X: bool=True,
- aggregation_methods = None,
- add_all_data: bool = True,
- exact_match: bool = True,
- verbose: bool = False,):
- #source = 'measurement'
- #features = [3012501]
- #add_aggregation_to_X = True
-
+
+ def note_nlp_map(
+ self,
+ ):
+ # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping
+ pass
+
+ def extract_features(
+ self,
+ adata,
+ source: Literal[
+ "observation",
+ "measurement",
+ "procedure_occurrence",
+ "specimen",
+ "device_exposure",
+ "drug_exposure",
+ "condition_occurrence",
+ ],
+ features: str or int or List[Union[str, int]] = None,
+ key: str = None,
+ columns_in_source_table: str or List[str] = None,
+ map_concept=True,
+ add_aggregation_to_X: bool = True,
+ aggregation_methods=None,
+ add_all_data: bool = True,
+ exact_match: bool = True,
+ remove_empty_column: bool = True,
+ ignore_not_shown_in_concept_table: bool = True,
+ verbose: bool = False,
+ ):
+ if key is None:
+ if source in ["measurement", "observation", "specimen"]:
+ key = f"{source}_concept_id"
+ elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]:
+ key = f"{source.split('_')[0]}_concept_id"
+ else:
+ raise KeyError(f"Extracting data from {source} is not supported yet")
+ """
if source == 'measurement':
columns = ["value_as_number", "measurement_datetime"]
elif source == 'observation':
columns = ["value_as_number", "value_as_string", "measurement_datetime"]
+ elif source == 'condition_occurrence':
+ columns = None
else:
raise KeyError(f"Extracting data from {source} is not supported yet")
-
-
+ """
+
# TODO load using Dask or Dask-Awkward
# Load source table using dask
- df_source = dd.read_csv(self.filepath[source], usecols=clinical_tables_columns[source], dtype=dtypes_dict["measurement"])
-
- if "concept" in self.tables:
- df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"], dtype = dtypes_dict["concept"]).dropna(subset=['concept_id', 'concept_name'])
- concept_dict = df_to_dict(df=df_concept, key = 'concept_id', value = 'concept_name')
- if map_concept:
- df_concept_relationship = dd.read_csv(self.filepath["concept_relationship"], usecols=vocabularies_tables_columns["concept_relationship"]).dropna(subset=['concept_id_1', 'concept_id_2', 'relationship_id'])
- concept_relationship_dict = df_to_dict(df=df_concept_relationship[df_concept_relationship['relationship_id'] == 'Maps to'], key = 'concept_id_1', value = 'concept_id_2')
- map_concept_id_list = []
- # Input could be feature names/feature id (concept id)
- # TODO support features name
+ column_types, parse_dates = get_column_types(self.filepath[source])
+ if len(parse_dates) == 1:
+ columns = list(column_types.keys()) + [parse_dates]
+ else:
+ columns = list(column_types.keys()) + parse_dates
+ df_source = dd.read_csv(
+ self.filepath[source], dtype=column_types, parse_dates=parse_dates
+ ) # , usecols=clinical_tables_columns[source]
+
if not features:
- raise KeyError(f"Please input the desired features you want to extarct")
+ warnings.warn(
+ "Please specify desired features you want to extract. Otherwise, it will try to extract all the features!"
+ )
+ features = list(df_source[key].compute().unique())
else:
- if isinstance(features, int) or isinstance(features, str):
- features = [features]
-
- # TODO query this in the table
-
- #concept_name = 'Base Excess|Blood|Blood Gas'
- #unit = 'mEq/L'
- #domain_id = 'Measurement'
- feature_id_list = []
- feature_name_list = []
- domain_id_list = []
- concept_class_id_list = []
- concept_code_list = []
- # Get feature id for each input, and check if each feature occurs in the concept table
- for feature in features:
- if isinstance(feature, int):
- try:
- feature_id = feature
- feature_name = concept_dict[feature_id]
- feature_id_list.append(feature_id)
- match_score = 1
- except KeyError:
- rprint(f"Feature ID - [red]{feature}[/] could not be found in concept table")
+ rprint(f"Trying to extarct the following features: {features}")
+
+ # Input could be feature names/feature id (concept id)
+ # First convert all input feature names into feature id. Map concept using CONCEPT_RELATIONSHIP table if required.
+ # Then try to extract feature data from source table using feature id.
+
+ # TODO support feature names
+
+ if "concept" in self.tables:
+ column_types, parse_dates = get_column_types(self.filepath["concept"])
+ df_concept = dd.read_csv(self.filepath["concept"], dtype=column_types, parse_dates=parse_dates).dropna(
+ subset=["concept_id", "concept_name"]
+ ) # usecols=vocabularies_tables_columns["concept"],
+ concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name")
+
+ # TODO query this in the table
+
+ feature_id_list = []
+ feature_name_list = []
+ domain_id_list = []
+ concept_class_id_list = []
+ concept_code_list = []
+
+ fetures_not_shown_in_concept_table = []
+
+ # Get feature id for each input, and check if each feature occurs in the concept table
+ for feature in features:
+ # if the input is feature ID
+ if isinstance(feature, numbers.Integral):
+ feature_id = feature
+ feature_id_1, feature_id_2 = self.map_concept_id(feature_id, verbose=False)
+ try:
+ feature_name = self.get_concept_name(feature_id_1, raise_error=True, verbose=False)
+ except KeyError:
+ if ignore_not_shown_in_concept_table:
+ fetures_not_shown_in_concept_table.append(feature)
+ continue
+ else:
+ rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table")
raise
- elif isinstance(feature, str):
-
- result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2)
- if len(result) == 2:
- match_score = result[0][1]
-
- if match_score != 1:
- if exact_match:
- rprint(f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similar ones: 1) [red]{result[0][0]}[/] 2) [red]{result[1][0]}")
- raise ValueError
- else:
- if result[1][1] == 1:
- rprint(f"Found multiple exact matches for [red]{feature}[/] in the concept table: 1) concept id: [red]{result[0][2]}[/] 2) concept id: [red]{result[1][2]}[/]. It is better to specify concept id directly.")
- raise ValueError
- feature_name = feature
- feature_id = result[0][2]
-
+ match_score = 1
+
+ # if the input is feature name
+ elif isinstance(feature, str):
+ # return a list of (value, key, score)
+ result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2)
+
+ # if find 2 best matches
+ if len(result) == 2:
+ match_score = result[0][2]
+
+ if match_score != 1:
+ if exact_match:
+ rprint(
+ f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similar ones: 1) [red]{result[0][0]}[/] 2) [red]{result[1][0]}"
+ )
+ raise ValueError
else:
- feature_name = result[0][0]
- match_score = result[0][1]
- feature_id = result[0][2]
- if exact_match and match_score != 1:
- rprint(f"Unable to find an exact match for [red]{feature}[/] in the concept table Similar one is [red]{result[0][0]}")
+ if result[1][1] == 1:
+ rprint(
+ f"Found multiple exact matches for [red]{feature}[/] in the concept table: 1) concept id: [red]{result[0][1]}[/] 2) concept id: [red]{result[1][1]}[/]. It is better to specify concept id directly."
+ )
raise ValueError
- feature_id_list.append(feature_id)
+ feature_name = feature
+ feature_id = result[0][1]
+ # if only find 1 match
else:
- rprint("Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct")
- raise TypeError
- if map_concept:
- concept_id = concept_relationship_dict[feature_id]
- map_concept_id_list.append(concept_id)
-
- feature_name_list.append(feature_name)
- domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0])
- concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0])
- concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0])
- if verbose:
- if map_concept:
+ feature_name = result[0][0]
+ match_score = result[0][1]
+ feature_id = result[0][2]
+ if exact_match and match_score != 1:
rprint(
- f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, feature ID [green]{concept_id}[/] in concept relationship table, match socre = [green]{match_score}."
+ f"Unable to find an exact match for [red]{feature}[/] in the concept table Similar one is [red]{result[0][0]}"
)
- else:
- rprint(
- f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}."
+ raise ValueError
+ feature_id_1, feature_id_2 = self.map_concept_id(feature_id)
+
+ else:
+ rprint(
+ "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct"
+ )
+ raise TypeError
+
+ # feature_name_list.append(feature_name)
+ # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0])
+ # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0])
+ # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0])
+
+ if verbose:
+ """
+ if map_concept:
+ rprint(
+ f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, feature ID [green]{concept_id}[/] in concept relationship table, match socre = [green]{match_score}."
+ )
+ else:
+ """
+ rprint(
+ f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}."
+ )
+
+ # for feature_id, feature_name, domain_id, concept_class_id, concept_code in zip(feature_id_list, feature_name_list, domain_id_list, concept_class_id_list, concept_code_list):
+ try:
+ feature_df = df_source[df_source[key] == feature_id_2].compute()
+ except:
+ print(f"Features ID could not be found in {source} table")
+ # TODO add checks if all columns exist in source table
+ if columns_in_source_table:
+ columns = columns_in_source_table
+
+ if remove_empty_column:
+ columns = [column for column in columns if not feature_df[column].isna().all()]
+
+ if len(feature_df) > 0:
+ obs_dict = [
+ {
+ column: list(feature_df[feature_df["visit_occurrence_id"] == int(visit_occurrence_id)][column])
+ for column in columns
+ }
+ for visit_occurrence_id in adata.obs.index
+ ]
+ adata.obsm[feature_name] = ak.Array(obs_dict)
+
+ if add_aggregation_to_X:
+ unit = feature_df["unit_source_value"].value_counts().index[0]
+ if aggregation_methods is None:
+ aggregation_methods = ["min", "max", "mean"]
+ var_name_list = [
+ f"{feature_name}_{aggregation_method}" for aggregation_method in aggregation_methods
+ ]
+ for aggregation_method in aggregation_methods:
+ func = getattr(ak, aggregation_method)
+ adata.obs[f"{feature_name}_{aggregation_method}"] = list(
+ func(adata.obsm[feature_name]["value_as_number"], axis=1)
)
-
- if map_concept:
- feature_id_list = map_concept_id_list
- for feature_id, feature_name, domain_id, concept_class_id, concept_code in zip(feature_id_list, feature_name_list, domain_id_list, concept_class_id_list, concept_code_list):
- try:
- feature_df = df_source[df_source[f"{source}_concept_id"] == feature_id].compute()
- except:
- print(f"Features ID could not be found in {source} table")
-
- if len(feature_df) > 0:
- print("extracting features")
- obs_dict = [{column: list(feature_df[feature_df['visit_occurrence_id'] == int(visit_occurrence_id)][column]) for column in columns} for visit_occurrence_id in adata.obs.index]
- adata.obsm[feature_name] = ak.Array(obs_dict)
-
- if add_aggregation_to_X:
- unit = feature_df['unit_source_value'].value_counts().index[0]
- if aggregation_methods is None:
- aggregation_methods = ['min', 'max', 'mean']
- var_name_list = [f'{feature_name}_{aggregation_method}' for aggregation_method in aggregation_methods]
- for aggregation_method in aggregation_methods:
- func = getattr(ak, aggregation_method)
- adata.obs[f'{feature_name}_{aggregation_method}'] = list(func(adata.obsm[feature_name]['value_as_number'], axis=1))
- adata = ep.ad.move_to_x(adata, var_name_list)
- adata.var.loc[var_name_list, 'Unit'] = unit
- adata.var.loc[var_name_list,'domain_id'] = domain_id
- adata.var.loc[var_name_list,'concept_class_id'] = concept_class_id
- adata.var.loc[var_name_list,'concept_code'] = concept_code
+ adata = ep.ad.move_to_x(adata, var_name_list)
+ adata.var.loc[var_name_list, "Unit"] = unit
+ adata.var.loc[var_name_list, "domain_id"] = domain_id
+ adata.var.loc[var_name_list, "concept_class_id"] = concept_class_id
+ adata.var.loc[var_name_list, "concept_code"] = concept_code
+ if len(fetures_not_shown_in_concept_table) > 0:
+ rprint(f"Couldn't find concept {fetures_not_shown_in_concept_table} in concept table!")
return adata
- # More IO functions
- def to_dataframe(self, adata, feature, patient, visit):
- df = ak.to_dataframe(adata.obsm['Base Excess|Blood|Blood Gas'])
-
+ # More IO functions
+ def to_dataframe(
+ self,
+ adata,
+ feature: str, # TODO also support list of features
+ # patient str or List, # TODO also support subset of patients/visit
+ ):
# TODO
# join index (visit_occurrence_id) to df
# can be viewed as patient level - only select some patient
-
+
+ df = ak.to_dataframe(adata.obsm[feature])
+
+ df.reset_index(drop=False, inplace=True)
+ df["entry"] = adata.obs.index[df["entry"]]
+ df = df.rename(columns={"entry": "visit_occurrence_id"})
+ del df["subentry"]
+ return df
# More Plot functions
- def plot_timeseries(self,):
+ def plot_timeseries(
+ self,
+ ):
# add one function from previous pipeline
pass
-
+
# More Pre-processing functions
- def sampling(self,):
+ def sampling(
+ self,
+ ):
# function from dask
- # need to check dask-awkward again
+ # need to check dask-awkward again
pass
diff --git a/src/ehrdata/ehrdata.py b/src/ehrdata/ehrdata.py
deleted file mode 100644
index b0e3045..0000000
--- a/src/ehrdata/ehrdata.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import awkward as ak
-import numpy as np
-import pandas as pd
-
-
-import ehrapy as ep
-import scanpy as sc
-from anndata import AnnData
-import mudata as md
-from mudata import MuData
-import os
-import glob
-
-class OMOP():
-
- def __init__(self, file_paths):
- self.base = file_paths
- file_list = glob.glob(os.path.join(file_paths, '*'))
- self.loaded_tabel = None
- self.tables = []
- for file_path in file_list:
- file_name = file_path.split('/')[-1].removesuffix('.csv')
- self.tables.append(file_name)
-
-
- @property
- def clinical_tables(self):
- """
- A dictionary containing all of the ``Clinical`` OMOP CDM tables in the connected database.
- """
- table_names = ['person','observation_period','specimen','death','visit_occurrence','visit_detail','procedure_occurrence','drug_exposure','device_exposure','condition_occurrence','measurement','note','note_nlp','observation','fact_relationship']
- return [table_name for table_name in self.tables if table_name in table_names]
-
- @property
- def vocabularies_tables(self):
- """
- A dictionary containing all of the ``Vocabularies`` OMOP CDM tables in the connected database.
- """
- table_names = ['concept','vocabulary','domain','concept_class','concept_relationship','relationship','concept_synonym','concept_ancestor','source_to_concept_map','drug_strength']
- return [table_name for table_name in self.tables if table_name in table_names]
-
- @property
- def metadata_tables(self):
- """
- A dictionary containing all of the ``MetaData`` OMOP CDM tables in the connected database.
- """
- table_names = ['cdm_source','metadata']
- return [table_name for table_name in self.tables if table_name in table_names]
-
- @property
- def health_system_tables(self):
- """
- A dictionary containing all of the ``Health System`` OMOP CDM tables in the connected database.
- """
- table_names = ['location','care_site','provider']
- return [table_name for table_name in self.tables if table_name in table_names]
-
- @property
- def derived_elements_tables(self):
- """
- A dictionary containing all of the ``Derived Elements`` OMOP CDM tables in the connected database.
- """
- table_names = ['cohort','cohort_definition','drug_era','dose_era','condition_era']
- return [table_name for table_name in self.tables if table_name in table_names]
-
- @property
- def health_economics_tables(self):
- """
- A dictionary containing all of the ``Health Economics`` OMOP CDM tables in the connected database.
- """
- table_names = ['payer_plan_period','cost']
- return [table_name for table_name in self.tables if table_name in table_names]
-
-
- def load(self, level='stay_level', add_to_X=None, features=None):
- if level == 'stay_level':
- self.visit_occurrence = pd.read_csv(f'{self.base}/visit_occurrence.csv')
- self.person = pd.read_csv(f'{self.base}/person.csv', index_col='person_id')
- self.death = pd.read_csv(f'{self.base}/death.csv', index_col='person_id')
- self.measurement = pd.read_csv(f'{self.base}/measurement.csv')
- self.observation = pd.read_csv(f'{self.base}/observation.csv')
- self.drug_exposure = pd.read_csv(f'{self.base}/drug_exposure.csv')
-
- self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure']
- self.joined_table = pd.merge(self.visit_occurrence, self.person, on='person_id', how='left')
- self.joined_table = pd.merge(self.joined_table, self.death, on='person_id', how='left')
-
-
-
- obs_only_list = list(self.joined_table.columns)
- obs_only_list.remove('visit_occurrence_id')
- adata = ep.ad.df_to_anndata(
- self.joined_table, index_column="visit_occurrence_id", columns_obs_only = obs_only_list)
-
-
- for column in self.measurement.columns:
- if column != 'visit_occurrence_id':
- obs_list = []
- for visit_occurrence_id in adata.obs.index:
- obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column]))
- adata.obsm[column]= ak.Array(obs_list)
-
- for column in self.drug_exposure.columns:
- if column != 'visit_occurrence_id':
- obs_list = []
- for visit_occurrence_id in adata.obs.index:
- obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column]))
- adata.obsm[column]= ak.Array(obs_list)
-
- for column in self.observation.columns:
- if column != 'visit_occurrence_id':
- obs_list = []
- for visit_occurrence_id in adata.obs.index:
- obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column]))
- adata.obsm[column]= ak.Array(obs_list)
-
-
- return adata