From efc36937a363e1c8405caf70d3e7e1d5799e765c Mon Sep 17 00:00:00 2001 From: orisenbazuru Date: Sun, 12 Jan 2020 18:22:24 +0100 Subject: [PATCH] added neural workflow for DDI model --- .gitignore | 52 + cluster/data/medinfmk/ddi/raw/drug_names.tsv | 1430 +++++++++++++++++ ddi/__init__.py | 0 ddi/dataset.py | 213 +++ ddi/model.py | 52 + ddi/run_workflow.py | 353 ++++ ddi/utilities.py | 330 ++++ .../02_AA_Skorch_DDI-checkpoint.ipynb | 581 ------- notebooks/02_AA_Skorch_DDI.ipynb | 581 ------- req.txt | 2 +- req_conda.txt | 197 +++ req_pip.txt | 25 + setup.py | 16 + 13 files changed, 2669 insertions(+), 1163 deletions(-) create mode 100644 cluster/data/medinfmk/ddi/raw/drug_names.tsv create mode 100644 ddi/__init__.py create mode 100644 ddi/dataset.py create mode 100644 ddi/model.py create mode 100644 ddi/run_workflow.py create mode 100644 ddi/utilities.py delete mode 100644 notebooks/.ipynb_checkpoints/02_AA_Skorch_DDI-checkpoint.ipynb delete mode 100644 notebooks/02_AA_Skorch_DDI.ipynb create mode 100644 req_conda.txt create mode 100644 req_pip.txt create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 4f5f323..913d836 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,54 @@ *.p *.pickle + +.DS_Store + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Vs code +.vscode + +# orisenbazuru +explore.py +notebooks/orisenbazuru/* +cluster/data/medinfmk/ddi/processed/* \ No newline at end of file diff --git a/cluster/data/medinfmk/ddi/raw/drug_names.tsv b/cluster/data/medinfmk/ddi/raw/drug_names.tsv new file mode 100644 index 0000000..9f3cca2 --- /dev/null +++ b/cluster/data/medinfmk/ddi/raw/drug_names.tsv @@ -0,0 +1,1430 @@ +CID100000085 carnitine +CID100000119 gamma-aminobutyric +CID100000137 5-aminolevulinic +CID100000143 leucovorin +CID100000146 5-methyltetrahydrofolate +CID100000158 PGE2 +CID100000159 prostacyclin +CID100000160 prostaglandin +CID100000175 acetate +CID100000187 acetylcholine +CID100000191 adenosine +CID100000206 glucose +CID100000214 PGE1 +CID100000222 ammonia +CID100000232 arginine +CID100000244 benzyl +CID100000247 betaine +CID100000271 calcium +CID100000297 graphene +CID100000298 chloramphenicol +CID100000303 bile +CID100000305 choline +CID100000311 citric +CID100000312 chloride +CID100000338 salicylate +CID100000401 D-cycloserine +CID100000444 bupropion +CID100000450 estradiol +CID100000453 mannitol +CID100000564 EACA +CID100000581 N-acetylcysteine +CID100000596 cytarabine +CID100000598 mesna +CID100000612 lactate +CID100000679 DMSO +CID100000681 dopamine +CID100000698 estrone +CID100000699 Oestrogen +CID100000700 monoethanolamine +CID100000727 lindane +CID100000738 glutamine +CID100000750 glycine +CID100000753 glycerol +CID100000767 bicarbonate +CID100000772 LMWH +CID100000774 histamine +CID100000785 quinol +CID100000813 potassium +CID100000815 kanamycin +CID100000838 epinephrine +CID100000853 thyroxine +CID100000861 triiodothyronine +CID100000888 magnesium +CID100000896 melatonin +CID100000923 sodium +CID100000937 niacin +CID100000942 nicotine +CID100000946 nitrite +CID100000948 nitrous +CID100000951 norepinephrine +CID100000961 hydroxyl +CID100001003 phosphate +CID100001046 
pyrazinamide +CID100001054 pyridoxine +CID100001065 quinidine +CID100001071 retinol +CID100001084 thiosulfate +CID100001125 tetrahydrobiopterin +CID100001130 thiamine +CID100001134 thymidine +CID100001206 methamphetamine +CID100001301 naproxen +CID100001546 cladribine +CID100001690 doxorubicin +CID100001727 4-AP +CID100001775 phenytoin +CID100001798 methazolamide +CID100001805 5-azacytidine +CID100001875 methylprednisolone +CID100001971 abacavir +CID100001972 amphotericin +CID100001978 acebutolol +CID100001983 acetaminophen +CID100001986 acetazolamide +CID100001990 acetohydroxamic +CID100001993 methacholine +CID100002019 actinomycin +CID100002021 spectinomycin +CID100002022 acyclovir +CID100002082 albendazole +CID100002083 salbutamol +CID100002085 Almeta +CID100002088 alendronate +CID100002092 alfuzosin +CID100002094 allopurinol +CID100002099 alosetron +CID100002118 alprazolam +CID100002123 hexamethylmelamine +CID100002130 amantadine +CID100002133 amcinonide +CID100002140 diatrizoate +CID100002141 amifostine +CID100002142 amikacin +CID100002145 aminoglutethimide +CID100002148 p-aminohippurate +CID100002153 theophylline +CID100002156 amiodarone +CID100002159 amisulpride +CID100002160 amitriptyline +CID100002161 amlexanox +CID100002162 amlodipine +CID100002163 amobarbital +CID100002168 amorolfine +CID100002170 amoxapine +CID100002171 amoxicillin +CID100002173 ampicillin +CID100002177 amprenavir +CID100002179 amsacrine +CID100002182 anagrelide +CID100002187 anastrozole +CID100002215 apomorphine +CID100002216 apraclonidine +CID100002232 argatroban +CID100002244 aspirin +CID100002249 atenolol +CID100002250 atorvastatin +CID100002265 azathioprine +CID100002266 azelaic +CID100002267 azelastine +CID100002269 azithromycin +CID100002274 aztreonam +CID100002284 baclofen +CID100002307 beclomethasone +CID100002308 beclomethasone +CID100002311 benazepril +CID100002315 bendrofluazide +CID100002337 benzocaine +CID100002341 benzphetamine +CID100002344 benztropine +CID100002345 benzyl +CID100002349 penicillin +CID100002350 bepotastine +CID100002351 bepridil +CID100002366 betahistine +CID100002367 dexamethasone +CID100002369 betaxolol +CID100002370 bethanechol +CID100002375 bicalutamide +CID100002391 bisacodyl +CID100002405 bisoprolol +CID100002431 bretylium +CID100002435 brimonidine +CID100002441 bromazepam +CID100002442 bromhexine +CID100002443 bromocriptine +CID100002462 budesonide +CID100002471 bumetanide +CID100002474 bupivacaine +CID100002476 buprenorphine +CID100002477 buspirone +CID100002478 busulfan +CID100002479 butabarbital +CID100002484 butenafine +CID100002487 butorphanol +CID100002512 cabergoline +CID100002519 caffeine +CID100002520 verapamil +CID100002522 calcipotriol +CID100002524 1,25(OH)2D3 +CID100002540 candesartan +CID100002541 candesartan +CID100002547 capreomycin +CID100002548 capsaicin +CID100002550 captopril +CID100002551 carbachol +CID100002554 carbamazepine +CID100002559 carbenicillin +CID100002563 carbidopa +CID100002564 carbinoxamine +CID100002576 carisoprodol +CID100002578 BCNU +CID100002583 carteolol +CID100002585 carvedilol +CID100002609 cefaclor +CID100002610 cefadroxil +CID100002617 cefazolin +CID100002622 cefepime +CID100002631 cefotaxime +CID100002637 cefoxitin +CID100002646 cefprozil +CID100002650 ceftazidime +CID100002654 cephem +CID100002655 ceftizoxime +CID100002656 ceftriaxone +CID100002658 cefuroxime +CID100002662 celecoxib +CID100002663 celiprolol +CID100002666 cephalexin +CID100002675 cefixime +CID100002676 cerivastatin +CID100002678 cetirizine +CID100002684 
cevimeline +CID100002708 chlorambucil +CID100002712 chlordiazepoxide +CID100002713 chlorhexidine +CID100002719 chloroquine +CID100002720 thiazide +CID100002725 chlorpheniramine +CID100002726 chlorpromazine +CID100002727 chlorpropamide +CID100002732 chlorthalidone +CID100002733 chlorzoxazone +CID100002749 ciclopirox +CID100002751 cilazapril +CID100002754 cilostazol +CID100002756 cimetidine +CID100002762 cinoxacin +CID100002764 ciprofloxacin +CID100002767 cisplatin +CID100002771 citalopram +CID100002781 clemastine +CID100002786 clindamycin +CID100002789 clobazam +CID100002791 clobetasol +CID100002792 clobetasone +CID100002794 clofazimine +CID100002800 clomiphene +CID100002801 clomipramine +CID100002802 clonazepam +CID100002803 clonidine +CID100002806 clopidogrel +CID100002809 clorazepate +CID100002812 clotrimazole +CID100002818 clozapine +CID100002826 cocaine +CID100002828 codeine +CID100002833 colchicine +CID100002881 sodium +CID100002883 crotamiton +CID100002891 cyanocobalamin +CID100002895 cyclobenzaprine +CID100002905 cyclopentolate +CID100002907 cyclophosphamide +CID100002909 v +CID100002913 cyproheptadine +CID100002914 cyproterone +CID100002949 danazol +CID100002951 dantrolene +CID100002955 dapsone +CID100002958 anthracycline +CID100002972 deferiprone +CID100002973 deferoxamine +CID100002978 delta +CID100002995 desipramine +CID100003000 desoximetasone +CID100003003 dexamethasone +CID100003007 amphetamine +CID100003008 dextromethorphan +CID100003009 DFMO +CID100003015 cyproterone +CID100003016 diazepam +CID100003019 diazoxide +CID100003032 diclofenac +CID100003038 dichlorphenamide +CID100003040 dicloxacillin +CID100003042 dicyclomine +CID100003043 didanosine +CID100003053 diethylenetriaminepentaacetic +CID100003056 diflorasone +CID100003059 diflunisal +CID100003060 epitopic +CID100003062 digoxin +CID100003063 dihydrocodeine +CID100003066 dihydroergotamine +CID100003075 diltiazem +CID100003080 dimercaprol +CID100003100 diphenhydramine +CID100003105 dipivefrin +CID100003108 dipyridamole +CID100003114 disopyramide +CID100003117 disulfiram +CID100003121 valproate +CID100003125 alpha-methyl-p-tyrosine +CID100003143 docetaxel +CID100003148 dolasetron +CID100003151 domperidone +CID100003152 donepezil +CID100003154 dorzolamide +CID100003155 dothiepin +CID100003156 doxapram +CID100003157 doxazosin +CID100003158 doxepin +CID100003168 droperidol +CID100003171 L-threo-DOPS +CID100003182 dyphylline +CID100003198 econazole +CID100003199 trabectedin +CID100003202 edrophonium +CID100003203 efavirenz +CID100003209 eicosapentaenoic +CID100003219 emedastine +CID100003222 CAS +CID100003226 enflurane +CID100003241 epinastine +CID100003249 vitamin +CID100003250 ergonovine +CID100003251 ergotamine +CID100003255 erythromycin +CID100003256 monomycin +CID100003261 estazolam +CID100003263 estradiol +CID100003267 estradiol +CID100003268 estramustine +CID100003269 estriol +CID100003278 ethacrynic +CID100003279 ethambutol +CID100003291 ethosuximide +CID100003292 ethotoin +CID100003305 etidronate +CID100003308 etodolac +CID100003310 etoposide +CID100003324 famciclovir +CID100003325 famotidine +CID100003331 felbamate +CID100003333 felodipine +CID100003339 fenofibrate +CID100003340 fenoldopam +CID100003342 fenoprofen +CID100003345 fentanyl +CID100003348 fexofenadine +CID100003350 finasteride +CID100003354 flavoxate +CID100003355 flecainide +CID100003363 FdUrd +CID100003364 flucloxacillin +CID100003365 fluconazole +CID100003366 5-fluorocytosine +CID100003367 fludarabine +CID100003368 FAMP +CID100003370 fludrocortisone 
+CID100003372 fluphenazine +CID100003373 flumazenil +CID100003375 flumethasone +CID100003379 flunisolide +CID100003380 flunitrazepam +CID100003381 fluocinolone +CID100003382 fluocinonide +CID100003384 fluorometholone +CID100003385 5-FU +CID100003386 fluoxetine +CID100003387 fluoxymesterone +CID100003388 fluphenazine +CID100003392 flurandrenolone +CID100003393 flurazepam +CID100003394 flurbiprofen +CID100003397 flutamide +CID100003399 fluticasone +CID100003403 fluvastatin +CID100003404 fluvoxamine +CID100003405 folate +CID100003406 4-methylpyrazole +CID100003410 formoterol +CID100003414 foscarnet +CID100003417 fosfomycin +CID100003419 fosinopril +CID100003425 Romidepsin +CID100003440 furosemide +CID100003443 fusidic +CID100003446 gabapentin +CID100003449 galantamine +CID100003454 ganciclovir +CID100003461 gemcitabine +CID100003462 gemeprost +CID100003463 gemfibrozil +CID100003467 gentamicin +CID100003475 gliclazide +CID100003476 glimepiride +CID100003478 glipizide +CID100003488 glibenclamide +CID100003494 glycopyrrolate +CID100003510 granisetron +CID100003512 griseofulvin +CID100003516 guaifenesin +CID100003517 icodextrin +CID100003519 guanfacine +CID100003520 guanidinium +CID100003553 halcinonide +CID100003559 haloperidol +CID100003598 hexachlorophene +CID100003623 homatropine +CID100003636 Buscopan +CID100003637 hydralazine +CID100003639 hydrochlorothiazide +CID100003640 cortisol +CID100003642 hydrocortisone +CID100003647 hydroflumethiazide +CID100003648 hydromorphone +CID100003652 hydroxychloroquine +CID100003657 hydroxyurea +CID100003658 hydroxyzine +CID100003661 atropine +CID100003672 ibuprofen +CID100003675 phenelzine +CID100003676 lidocaine +CID100003685 idarubicin +CID100003687 IdUrd +CID100003690 ifosfamide +CID100003696 imipramine +CID100003702 indapamide +CID100003706 indinavir +CID100003715 indomethacin +CID100003724 iodixanol +CID100003730 iohexol +CID100003734 iopamidol +CID100003736 iopromide +CID100003737 sodium +CID100003738 iotrolan +CID100003739 iodipamide +CID100003741 ioversol +CID100003742 ioxaglate +CID100003743 ioxilan +CID100003746 ipratropium +CID100003749 irbesartan +CID100003750 irinotecan +CID100003759 isocarboxazid +CID100003763 isoflurane +CID100003767 isoniazid +CID100003777 isopropyl +CID100003779 isoproterenol +CID100003780 isosorbide +CID100003783 isoxsuprine +CID100003784 isradipine +CID100003793 itraconazole +CID100003821 ketamine +CID100003823 ketoconazole +CID100003825 ketoprofen +CID100003826 ketorolac +CID100003827 ketotifen +CID100003830 cytokinin +CID100003848 phenyllactate +CID100003869 labetalol +CID100003872 lactulose +CID100003877 lamivudine +CID100003878 lamotrigine +CID100003883 lansoprazole +CID100003890 latanoprost +CID100003899 leflunomide +CID100003902 letrozole +CID100003911 Leuprorelin +CID100003914 levobunolol +CID100003915 levocabastine +CID100003916 levomepromazine +CID100003918 dextrorphan +CID100003928 lincomycin +CID100003929 linezolid +CID100003937 Lisinopril +CID100003938 lisuride +CID100003939 LiOH +CID100003948 lomefloxacin +CID100003950 lomustine +CID100003954 loperamide +CID100003957 loratadine +CID100003958 lorazepam +CID100003961 losartan +CID100003962 lovastatin +CID100003964 loxapine +CID100003998 mafenide +CID100004004 malathion +CID100004011 maprotiline +CID100004030 mebendazole +CID100004031 mebeverine +CID100004032 mecamylamine +CID100004033 nitrogen +CID100004034 monamine +CID100004036 meclofenamate +CID100004042 medroxyprogesterone +CID100004043 medrysone +CID100004044 mefenamic +CID100004046 mefloquine 
+CID100004048 megestrol +CID100004051 meloxicam +CID100004053 melphalan +CID100004054 memantine +CID100004057 Cantril +CID100004058 meperidine +CID100004060 mephenytoin +CID100004062 mepivacaine +CID100004064 meprobamate +CID100004075 5-ASA +CID100004086 metaproterenol +CID100004091 metformin +CID100004095 methadone +CID100004101 methenamine +CID100004107 methocarbamol +CID100004112 methotrexate +CID100004114 8-MOP +CID100004120 N-methylscopolamine +CID100004121 methyclothiazide +CID100004138 methyldopa +CID100004139 methylene +CID100004140 methylergometrine +CID100004158 methylphenidate +CID100004159 methylprednisolone +CID100004160 methyltestosterone +CID100004163 methysergide +CID100004168 metoclopramide +CID100004170 metolazone +CID100004171 metoprolol +CID100004173 metronidazole +CID100004174 metyrapone +CID100004178 mexiletine +CID100004184 mianserin +CID100004189 miconazole +CID100004192 midazolam +CID100004195 midodrine +CID100004196 mifepristone +CID100004197 milrinone +CID100004201 minoxidil +CID100004205 mirtazapine +CID100004211 mitotane +CID100004212 mitoxantrone +CID100004235 moclobemide +CID100004236 modafinil +CID100004240 mometasone +CID100004248 montelukast +CID100004253 morphine +CID100004259 moxifloxacin +CID100004264 mupirocin +CID100004271 mycophenolate +CID100004272 mycophenolic +CID100004274 tetrofosmin +CID100004409 nabumetone +CID100004411 nadolol +CID100004419 nalbuphine +CID100004421 nalidixic +CID100004422 nalmefene +CID100004425 naloxone +CID100004428 naltrexone +CID100004432 nandrolone +CID100004436 naphazoline +CID100004440 naratriptan +CID100004443 nateglinide +CID100004449 nefazodone +CID100004450 nefopam +CID100004451 nelfinavir +CID100004454 neomycin +CID100004456 neostigmine +CID100004463 nevirapine +CID100004473 nicardipine +CID100004485 nifedipine +CID100004493 nilutamide +CID100004497 nimodipine +CID100004499 nisoldipine +CID100004506 nitrazepam +CID100004509 nitrofurantoin +CID100004510 nitroglycerin +CID100004513 nizatidine +CID100004536 norethisterone +CID100004539 norfloxacin +CID100004542 levonorgestrel +CID100004543 nortriptyline +CID100004547 repaglinide +CID100004568 nystatin +CID100004583 ofloxacin +CID100004585 olanzapine +CID100004594 omeprazole +CID100004595 ondansetron +CID100004599 orlistat +CID100004601 orphenadrine +CID100004603 oseltamivir +CID100004607 oxacillin +CID100004609 oxaliplatin +CID100004614 oxaprozin +CID100004616 oxazepam +CID100004623 oxiconazole +CID100004631 oxprenolol +CID100004634 oxybutynin +CID100004635 oxycodone +CID100004638 oxymetholone +CID100004639 oxymorphone +CID100004649 APAs +CID100004666 paclitaxel +CID100004673 pamidronate +CID100004675 pancuronium +CID100004678 dexpanthenol +CID100004679 pantoprazole +CID100004680 papaverine +CID100004689 paromomycin +CID100004691 paroxetine +CID100004695 isosulfan +CID100004724 penbutolol +CID100004725 penciclovir +CID100004727 D-penicillamine +CID100004730 penicillin +CID100004735 pentamidine +CID100004736 pentazocine +CID100004737 pentobarbital +CID100004739 pentostatin +CID100004740 pentoxifylline +CID100004745 pergolide +CID100004746 perhexiline +CID100004747 propericiazine +CID100004748 perphenazine +CID100004756 phenylazo +CID100004763 phenobarbital +CID100004768 phenoxybenzamine +CID100004771 phentermine +CID100004775 4-PBA +CID100004782 phenylephrine +CID100004786 phenylpropanolamine +CID100004810 moxonidine +CID100004811 physostigmine +CID100004812 vitamin +CID100004819 pilocarpine +CID100004828 pindolol +CID100004829 pioglitazone +CID100004834 piperacillin 
+CID100004845 pirbuterol +CID100004865 podophyllotoxin +CID100004868 polymyxin +CID100004870 polythiazide +CID100004885 pramipexole +CID100004889 pravastatin +CID100004891 praziquantel +CID100004893 prazosin +CID100004894 prednisolone +CID100004895 prednisolone +CID100004896 prednisolone +CID100004900 prednisone +CID100004906 prilocaine +CID100004908 primaquine +CID100004909 primidone +CID100004911 probenecid +CID100004913 procainamide +CID100004914 procaine +CID100004915 procarbazine +CID100004917 prochlorperazine +CID100004920 progesterone +CID100004923 proguanil +CID100004927 promethazine +CID100004932 propafenone +CID100004934 propantheline +CID100004935 proparacaine +CID100004943 propofol +CID100004946 propranolol +CID100004976 protriptyline +CID100004989 pyrantel +CID100004991 pyridostigmine +CID100004992 mepyramine +CID100004993 pyrimethamine +CID100004999 quazepam +CID100005002 quetiapine +CID100005005 quinapril +CID100005029 rabeprazole +CID100005032 ephedrine +CID100005035 raloxifene +CID100005038 ramipril +CID100005039 ranitidine +CID100005040 rapamycin +CID100005051 rescinnamine +CID100005052 reserpine +CID100005064 ribavirin +CID100005070 riluzole +CID100005071 rimantadine +CID100005073 risperidone +CID100005076 ritonavir +CID100005077 rivastigmine +CID100005078 rizatriptan +CID100005090 rofecoxib +CID100005095 ropinirole +CID100005106 roxithromycin +CID100005152 salmeterol +CID100005155 stavudine +CID100005161 salsalate +CID100005184 scopolamine +CID100005193 secobarbital +CID100005195 deprenyl +CID100005203 sertraline +CID100005206 sevoflurane +CID100005210 sibutramine +CID100005212 sildenafil +CID100005214 silver +CID100005215 sulfadiazine +CID100005238 sodium +CID100005245 risedronate +CID100005248 sodium +CID100005253 sotalol +CID100005257 sparfloxacin +CID100005267 spironolactone +CID100005291 imatinib +CID100005297 streptomycin +CID100005300 streptozotocin +CID100005311 vorinostat +CID100005314 succinylcholine +CID100005318 sulconazole +CID100005320 sulfacetamide +CID100005329 sulfamethoxazole +CID100005333 sulfonamide +CID100005352 sulindac +CID100005358 sumatriptan +CID100005359 suprofen +CID100005372 tacrolimus +CID100005376 tamoxifen +CID100005379 gatifloxacin +CID100005381 tazarotene +CID100005391 temazepam +CID100005394 temozolomide +CID100005396 teniposide +CID100005401 terazosin +CID100005402 terbinafine +CID100005403 terbutaline +CID100005404 terconazole +CID100005407 testolactone +CID100005408 testosterone +CID100005409 testosterone +CID100005410 testosterone +CID100005411 tetracaine +CID100005419 tetrahydrozoline +CID100005426 thalidomide +CID100005430 thiabendazole +CID100005452 thioridazine +CID100005453 thiotepa +CID100005454 thiothixene +CID100005466 tiagabine +CID100005468 tiaprofenic +CID100005470 tibolone +CID100005472 ticlopidine +CID100005478 timolol +CID100005479 tinidazole +CID100005483 tiopronin +CID100005486 tirofiban +CID100005487 tizanidine +CID100005496 tobramycin +CID100005503 tolazamide +CID100005505 tolbutamide +CID100005508 tolmetin +CID100005512 tolterodine +CID100005514 topiramate +CID100005515 topotecan +CID100005516 toremifene +CID100005523 tramadol +CID100005524 tramazoline +CID100005525 trandolapril +CID100005526 tranexamic +CID100005530 tranylcypromine +CID100005533 trazodone +CID100005538 retinoic +CID100005544 triamcinolone +CID100005546 triamterene +CID100005556 triazolam +CID100005564 triclosan +CID100005565 trien +CID100005566 trifluoperazine +CID100005572 trihexyphenidyl +CID100005576 trimethadione +CID100005577 
trimethobenzamide +CID100005578 trimethoprim +CID100005582 trimetrexate +CID100005584 trimipramine +CID100005591 troglitazone +CID100005593 tropicamide +CID100005595 tropisetron +CID100005596 trospium +CID100005625 delavirdine +CID100005636 unoprostone +CID100005645 UDCA +CID100005647 VACV +CID100005650 valsartan +CID100005651 Vancocine +CID100005656 venlafaxine +CID100005665 vigabatrin +CID100005672 vinorelbine +CID100005717 zafirlukast +CID100005718 zalcitabine +CID100005719 zaleplon +CID100005721 zanamivir +CID100005726 zidovudine +CID100005727 ZnCl2 +CID100005731 zolmitriptan +CID100005732 zolpidem +CID100005734 zonisamide +CID100005735 zopiclone +CID100005746 mitomycin +CID100005771 oxytocin +CID100005775 phentolamine +CID100005877 demethyl +CID100005878 oxandrolone +CID100005939 biguanide +CID100005978 vincristine +CID100006018 tetrabenazine +CID100006049 EDTA +CID100006058 cysteamine +CID100006116 calcium +CID100006238 17-hydroxyprogesterone +CID100006256 trifluorothymidine +CID100006432 Optison +CID100006436 triamcinolone +CID100006451 bromcresol +CID100006468 phencyclidine +CID100006476 methsuximide +CID100006503 tris +CID100006726 cyclizine +CID100007012 phenylbutyric +CID100007029 diethylpropion +CID100007187 benzoyl +CID100007638 monobenzone +CID100007699 benzonatate +CID100008197 tetraen +CID100008230 argenine +CID100008612 chloroprocaine +CID100008982 nafcillin +CID100009034 methohexital +CID100009354 dimercaptosuccinic +CID100009433 aminophylline +CID100009904 nandrolone +CID100010100 propoxyphene +CID100010340 sodium +CID100010413 hydroxybutyrate +CID100010547 echothiophate +CID100010631 medroxyprogesterone +CID100010660 dimenhydrinate +CID100011125 lithium +CID100011973 Brolene +CID100012453 zuclopenthixol +CID100012460 phendimetrazine +CID100012536 desonide +CID100012555 benzydamine +CID100012559 erythromycin +CID100012597 isomannide +CID100012620 Tadenan +CID100013314 lormetazepam +CID100013342 vinblastine +CID100014789 ferumoxytol +CID100014888 arsenic +CID100014917 fluoride +CID100015232 benzathine +CID100015459 metaxalone +CID100016124 methylene +CID100016230 amiloride +CID100016362 pimozide +CID100016533 betamethasone +CID100016850 fluorescein +CID100016886 5-aza-2'-deoxycytidine +CID100017358 SonoVue +CID100018140 estramustine +CID100019090 megestrol +CID100020585 choline +CID100020969 clocortolone +CID100021800 betamethasone +CID100021945 methenamine +CID100022258 hydrocodone +CID100022318 gold +CID100022502 procaine +CID100023703 ogen +CID100023897 molindone +CID100023925 Fe(III +CID100023926 lanthanum +CID100023951 samarium +CID100023954 silver +CID100023957 technetium-99m +CID100023976 chromium +CID100023978 copper +CID100023982 gadolinium +CID100023993 yttrium +CID100023994 zinc +CID100024087 selenium +CID100024414 barium +CID100024424 zinc +CID100024450 potassium +CID100024642 TlCl +CID100024706 Triphasil +CID100024748 glucose +CID100024841 iodide +CID100025419 clodronate +CID100025959 Prussian +CID100027304 HMDP +CID100027400 pizotifen +CID100027661 isosorbide-5-mononitrate +CID100027991 desmopressin +CID100027993 conjugated +CID100028332 propoxyphene +CID100028486 lithium +CID100029089 chlorhexidine +CID100030623 dexrazoxane +CID100031072 carbimazole +CID100031264 paraldehyde +CID100031378 fludrocortisone +CID100031477 metipranolol +CID100032169 articaine +CID100032281 Protirelin +CID100032603 clindamycin +CID100032797 clobetasol +CID100032800 Asp-Tyr(SO3H)-Met-Gly-Trp-Met-Asp-Phe-NH2 +CID100034312 oxcarbazepine +CID100036339 etomidate +CID100036523 
gonadorelin +CID100036811 dobutamine +CID100037392 halofantrine +CID100037720 pentosan +CID100038904 carboplatin +CID100039042 bezafibrate +CID100039507 rimexolone +CID100039524 stiripentol +CID100039764 vecuronium +CID100039860 nabilone +CID100040159 permethrin +CID100040632 pirfenidone +CID100040703 pinaverium +CID100040973 desogestrel +CID100040976 Implanon +CID100041317 acitretin +CID100041684 nitazoxanide +CID100041693 sufentanil +CID100041744 valrubicin +CID100041774 acarbose +CID100041781 torasemide +CID100042113 desflurane +CID100042395 Westcort +CID100042615 Supremon +CID100042955 misoprostol +CID100044563 lodoxamide +CID100044564 lodoxamide +CID100045469 sodium +CID100047319 atracurium +CID100047320 cisatracurium +CID100047419 cefuroxime +CID100047471 butoconazole +CID100047528 nicorandil +CID100047640 naftifine +CID100047725 Goserelin +CID100050294 nedocromil +CID100050614 cefotetan +CID100051263 alfentanil +CID100051577 miglitol +CID100051634 miglustat +CID100052421 prednicarbate +CID100054158 mebrofenin +CID100054313 iloprost +CID100054331 fosfomycin +CID100054373 Octreotide +CID100054454 simvastatin +CID100054547 cefpodoxime +CID100054688 clarithromycin +CID100054786 treprostinil +CID100054808 loteprednol +CID100054840 atomoxetine +CID100054949 NSC +CID100055331 moexiprilat +CID100055466 Gd-DTPA +CID100055480 milnacipran +CID100056338 fosphenytoin +CID100056959 ranolazine +CID100057166 Photofrin +CID100057469 imiquimod +CID100057537 rotigotine +CID100057697 pemirolast +CID100059708 levetiracetam +CID100059768 esmolol +CID100060146 tamsulosin +CID100060164 adapalene +CID100060172 adefovir +CID100060183 perindopril +CID100060184 perindopril +CID100060198 exemestane +CID100060490 zileuton +CID100060496 amlodipine +CID100060612 dexmedetomidine +CID100060613 cidofovir +CID100060668 etoposide +CID100060695 rocuronium +CID100060706 meropenem +CID100060714 gadoteridol +CID100060726 bromfenac +CID100060751 mTHPC +CID100060752 ibutilide +CID100060754 gadodiamide +CID100060787 saquinavir +CID100060795 aripiprazole +CID100060814 remifentanil +CID100060830 tiotropium +CID100060834 duloxetine +CID100060843 pemetrexed +CID100060852 ibandronate +CID100060853 ziprasidone +CID100060860 metaiodobenzylguanidine +CID100060864 olopatadine +CID100060867 levosimendan +CID100060871 adefovir +CID100060877 emtricitabine +CID100060878 eprosartan +CID100060936 tiludronate +CID100060953 capecitabine +CID100061475 sodium +CID100061799 monomethylfumarate +CID100062305 doxycycline +CID100062358 ammonium +CID100062816 colestipol +CID100062819 Metrodin +CID100062924 fluticasone +CID100062956 fosinoprilat +CID100062959 trovafloxacin +CID100062965 beclomethasone +CID100063001 benazeprilat +CID100064147 valganciclovir +CID100064778 Meropenem +CID100064929 fenofibric +CID100064987 tenofovir +CID100065014 AMD3100 +CID100065157 testosterone +CID100065281 CAS +CID100065370 Glat +CID100065628 bendamustine +CID100065840 colestimide +CID100065856 reboxetine +CID100065863 sertaconazole +CID100065866 lercanidipine +CID100065999 telmisartan +CID100068613 atosiban +CID100068740 zoledronic +CID100068844 brinzolamide +CID100069512 malvidin +CID100071158 acamprosate +CID100071273 ropivacaine +CID100071301 nebivolol +CID100071316 mivacurium +CID100071329 dofetilide +CID100071348 lanreotide +CID100071360 iloperidone +CID100071362 Hoe +CID100071406 TPGS +CID100071436 Lutalyse +CID100071469 erythromycin +CID100071616 voriconazole +CID100072022 perindoprilat +CID100072054 darifenacin +CID100072081 Terlipressin +CID100072111 
rifaximin +CID100072466 bleomycin +CID100072938 gemifloxacin +CID100073303 doripenem +CID100073658 telithromycin +CID100074989 atovaquone +CID100077992 frovatriptan +CID100077993 eletriptan +CID100077996 paricalcitol +CID100077997 MK-462 +CID100077998 rosiglitazone +CID100078032 Vallergan +CID100082146 bexarotene +CID100082148 agomelatine +CID100083030 Ge-132 +CID100083513 barium +CID100083606 strontium +CID100084003 ketorolac +CID100091488 fluocinolone +CID100093860 bortezomib +CID100096312 nelarabine +CID100102258 rasburicase +CID100102399 dextran +CID100104741 ICI +CID100104758 raltitrexed +CID100104778 levodopa/carbidopa +CID100104799 fotemustine +CID100104849 rimonabant +CID100104865 bosentan +CID100105145 68Ga +CID100107694 Diane-35 +CID100107969 FTY720 +CID100107994 quinaprilat +CID100110634 vardenafil +CID100110635 tadalafil +CID100114709 eslicarbazepine +CID100115237 paliperidone +CID100115355 NTBC +CID100119182 clofarabine +CID100119212 benzylpenicilloyl +CID100119607 valdecoxib +CID100119828 parecoxib +CID100119830 tenofovir +CID100121396 N-carbamylglutamate +CID100121749 ustekinumab +CID100121892 retigabine +CID100122197 FP-CIT +CID100122316 rasagiline +CID100123015 SU5416 +CID100123597 travoprost +CID100123606 almotriptan +CID100123610 Eptifibatide +CID100123611 fondaparinux +CID100123619 etoricoxib +CID100123620 mometasone +CID100123623 Cancidas +CID100123631 gefitinib +CID100123634 Trisequens +CID100123809 Madopar +CID100124087 desloratadine +CID100125017 desvenlafaxine +CID100125889 pregabalin +CID100127909 bimatoprost +CID100128549 trandolaprilat +CID100129228 rufinamide +CID100129806 rosuvastatin +CID100130564 esomeprazole +CID100130881 olmesartan +CID100131535 fosamprenavir +CID100132804 pitavastatin +CID100132971 abiraterone +CID100132999 ivabradine +CID100134018 febuxostat +CID100134019 pixantrone +CID100134780 pomalidomide +CID100135113 sacrosidase +CID100145068 nitric +CID100147740 drospirenone +CID100147912 posaconazole +CID100148121 pralatrexate +CID100148127 ospemifene +CID100148191 temsirolimus +CID100148192 atazanavir +CID100148211 palonosetron +CID100150310 eplerenone +CID100150311 ezetimibe +CID100150610 ertapenem +CID100151075 nepafenac +CID100151165 aprepitant +CID100151171 conivaptan +CID100152945 dutasteride +CID100153941 entecavir +CID100153994 clevidipine +CID100154058 solifenacin +CID100154256 bazedoxifene +CID100156326 asenapine +CID100156418 cinacalcet +CID100157429 gadopentetate +CID100157688 vinflunine +CID100157920 lubiprostone +CID100157921 methyl +CID100158781 olmesartan +CID100158786 pegaptanib +CID100159247 sevelamer +CID100160036 MnDPDP +CID100160051 colesevelam +CID100160352 zidovudine/lamivudine +CID100163296 tipranavir +CID100166548 Anidulafungin +CID100168625 poly(styrene +CID100168924 lanthanum +CID100170361 varenicline +CID100171558 Kaluril +CID100176870 erlotinib +CID100179344 eslicarbazepine +CID100185457 S-benzoylmercaptoacetyltriglycine +CID100192155 Prednefrin +CID100193962 etravirine +CID100197281 gadobenate +CID100197712 ambrisentan +CID100206527 L-Dmp +CID100208898 dronedarone +CID100208902 ramelteon +CID100208908 lapatinib +CID100208920 NuvaRing +CID100213023 dabigatran +CID100213039 darunavir +CID100213046 lurasidone +CID100214339 luliconazole +CID100216209 aliskiren +CID100216210 dabigatran +CID100216235 sitaxsentan +CID100216237 tolvaptan +CID100216239 sorafenib +CID100216258 Colimycin +CID100216326 lenalidomide +CID100216416 lasofoxifene +CID100219024 regadenoson +CID100219078 lacosamide +CID100219084 Gd-EOB-DTPA 
+CID100219090 fosaprepitant +CID100222786 cortisone +CID100315411 18F-FDG +CID100358641 trimethoprim-sulfamethoxazole +CID100441332 carboprost +CID100441382 natamycin +CID100443379 CAS +CID100444006 cefditoren +CID100444013 gadoversetamide +CID100444033 ciclesonide +CID100449193 roflumilast +CID100455658 hemin +CID100477468 FK463 +CID100483407 maraviroc +CID100489129 efinaconazole +CID100517045 sodium +CID100644241 nilotinib +CID100656628 silodosin +CID100656892 TMC207 +CID100657298 propylthiouracil +CID100667490 6-mercaptopurine +CID101349907 methimazole +CID102723601 6-thioguanine +CID102761171 ethionamide +CID103006171 rilpivirine +CID103010818 d-telaprevir +CID103038497 fospropofol +CID103052762 prucalopride +CID103055172 pimecrolimus +CID103062316 dasatinib +CID103080904 Estrofem +CID103081276 fluticasone/salmeterol +CID103081361 vandetanib +CID103081362 telavancin +CID103081884 copolymer +CID103085017 sevelamer +CID103086257 Photofrin +CID103086685 axitinib +CID103086686 sunitinib +CID103325225 ALX +CID104369359 sitagliptin +CID104474778 2-hydroxysuccinaldehyde +CID104479094 doxercalciferol +CID104479097 hydroxocobalamin +CID104517618 sodium +CID104630253 alclometasone +CID104659568 entacapone +CID104659569 tolcapone +CID105001396 warfarin +CID105251896 vildagliptin +CID105273759 abacavir-lamivudine +CID105277135 elvitegravir +CID105281007 dacarbazine +CID105310993 acipimox +CID105311167 halobetasol +CID105327147 polyoxyethylene +CID105328940 bosutinib +CID105353894 pralidoxime +CID105353980 sulfasalazine +CID105359596 arsenic +CID105360237 Paroven +CID105361912 rifabutin +CID105361917 methylnaltrexone +CID105362070 balsalazide +CID105362420 verteporfin +CID105381226 rifampicin +CID105462337 olsalazine +CID105464096 ramiprilat +CID105479141 CGP +CID105487068 Mersyndol +CID105487301 tegaserod +CID105488383 ofatumumab +CID105488547 alvimopan +CID105493381 deferasirox +CID106102852 gadobutrol +CID106323497 rifapentine +CID106326970 selenium +CID106328144 radium +CID106328526 AC1O3HA7 +CID106331630 eribulin +CID106333887 auranofin +CID106398970 cefdinir +CID106433082 hexaminolevulinate +CID106433091 tapentadol +CID106433101 tafluprost +CID106433117 indacaterol +CID106433119 rivaroxaban +CID106435110 ivermectin +CID106437075 Timentin +CID106440191 fidaxomicin +CID106442177 everolimus +CID106445540 ixabepilone +CID106452749 Locorten +CID106453361 Sativex +CID106477186 delamanid +CID106850789 iron +CID106850791 leuprorelin +CID106918182 strontium +CID106918313 vilazodone +CID106918366 fosaprepitant +CID106918430 ceftobiprole +CID106918456 prasugrel +CID106918462 retapamulin +CID106918558 fesoterodine +CID106918584 sugammadex +CID106918638 belinostat +CID106918670 PEP005 +CID109794842 tasimelteon +CID109800339 olodaterol +CID109810131 ceftaroline +CID109811221 cabazitaxel +CID109812414 dalteparin +CID109815559 trans +CID109818231 tofacitinib +CID109825285 azilsartan +CID109831414 Lovaza +CID109831761 irbesartan-hydrochlorothiazide +CID109831783 Stalevo +CID109846180 eltrombopag +CID109853053 lomitapide +CID109854489 fluticasone +CID109865528 mirabegron +CID109869929 avanafil +CID109871419 ticagrelor +CID109887712 dapagliflozin +CID109898619 Depreotide +CID109912092 apremilast +CID109924495 perampanel +CID109930048 vernakalant +CID109940864 Benicar-HCT +CID109941444 SOM230 +CID109966051 Lu +CID110028615 vorapaxar +CID110096344 linagliptin +CID110107393 18F-flutemetamol +CID110113978 pazopanib +CID110163178 afatinib +CID110178705 besifloxacin +CID110182969 apixaban +CID110324367 boceprevir 
+CID110465263 lorcaserin +CID110482134 glycerol +CID111001318 tafamidis +CID111020241 ascorbate +CID111163584 alogliptin +CID111167602 regorafenib +CID111228026 aclidinium +CID111234049 TR-700 +CID111235728 Saxagliptin +CID111238823 azilsartan +CID111254352 sodium +CID111304743 riociguat +CID111499245 AN2690 +CID111501341 florbetaben +CID111505907 Zyprexa +CID111519069 umeclidinium +CID111531537 eliglustat +CID111556711 Carfilzomib +CID111593706 sevelamer +CID111597571 crizotinib +CID111597697 lisdexamfetamine +CID111672461 gadofosveset +CID111707110 trametinib +CID111722286 spinosad +CID111947681 nitroprusside +CID111949646 empagliflozin +CID111979316 vasopressin +CID113559279 ulipristal +CID115951529 MDV3100 +CID116004692 macitentan +CID116065945 TMC435 +CID116126651 PCI-32765 +CID116129616 salmon +CID116129617 cosyntropin +CID116129629 LY146032 +CID116129632 Nuvocid +CID116129665 Vitrum +CID116129672 Insulin +CID116129682 Forteo +CID116129690 ziconotide +CID116129701 T-A2-3 +CID116129703 Revasc +CID116129704 Bivalirudin +CID116130199 Enfuvirtide +CID116130295 BPTI +CID116130957 Org +CID116131215 Abarelix +CID116131310 hepatitis +CID116132265 ACTH(1-39 +CID116132283 glucagon +CID116132344 oCRH +CID116132418 NovoLog +CID116132438 Humalog +CID116132441 Refludan +CID116132446 Symlin +CID116134956 liraglutide +CID116136245 degarelix +CID116137271 insulin +CID116139342 Lyxumia +CID116139605 teduglutide +CID116156130 exenatide +CID116158207 linaclotide +CID116158473 o291 +CID116213095 pentastarch +CID116220172 ivacaftor +CID117754772 ruxolitinib +CID119371515 alcaftadine +CID122834577 nesiritide +CID123668479 raltegravir +CID123689036 pertechnetate +CID123690938 piroxicam +CID124762228 Azarga +CID124776445 vismodegib +CID124812758 canagliflozin +CID124822371 florbetapir +CID124826799 ponatinib +CID124838347 glucagon +CID124846132 hetastarch +CID124950485 cobicistat +CID124965990 suvorexant +CID125074470 Triptorelin +CID125074886 cetrorelix +CID125077405 Nafarelin +CID125077993 Histrelin +CID125094462 sofosbuvir +CID125102847 cabozantinib +CID125880656 doxycycline +CID125880664 tetracycline +CID126275995 minocycline +CID140468184 acenocoumarol +CID142611257 vemurafenib +CID144134877 oxytetracycline +CID144146714 Lantus +CID144201342 tesamorelin +CID144201343 ecallantide +CID144462760 dabrafenib +CID144564107 mipomersen +CID144564722 Promacta +CID144567678 dalbavancin +CID146181941 Signifor +CID146216142 dolutegravir +CID151508717 tenoxicam +CID151601240 demeclocycline +CID153477714 heparin +CID153627505 hydroxypropyl +CID154677977 A77 +CID154681041 tigecycline +CID154682541 doxycycline +CID154687131 lymecycline +CID156603655 pegaptanib +CID156842239 n-3 +CID170683024 x +CID170695640 colestyramine +CID171306834 K779 diff --git a/ddi/__init__.py b/ddi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ddi/dataset.py b/ddi/dataset.py new file mode 100644 index 0000000..561b30e --- /dev/null +++ b/ddi/dataset.py @@ -0,0 +1,213 @@ +import os +import numpy as np +import torch +from .utilities import ModelScore, ReaderWriter +from torch.utils.data import Dataset, DataLoader +from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit +from sklearn.utils.class_weight import compute_class_weight + + +class DDIDataTensor(Dataset): + + def __init__(self, X_feat, y): + self.X_feat = X_feat # tensor.float32, (drug pairs, features) + # drug interactions + self.y = y # tensor.float32, (drug pairs,) + self.num_samples = self.y.size(0) # int, number of drug pairs + + def 
__getitem__(self, indx):
+        return(self.X_feat[indx], self.y[indx], indx)
+
+    def __len__(self):
+        return(self.num_samples)
+
+
+class PartitionDataTensor(Dataset):
+
+    def __init__(self, ddi_datatensor, partition_ids, dsettype, fold_num):
+        self.ddi_datatensor = ddi_datatensor  # instance of :class:`DDIDataTensor`
+        self.partition_ids = partition_ids  # list of indices for drug pairs
+        self.dsettype = dsettype  # string, dataset type (i.e. train, validation, test)
+        self.fold_num = fold_num  # int, fold number
+        self.num_samples = len(self.partition_ids)  # int, number of drug pairs in the partition
+
+    def __getitem__(self, indx):
+        target_id = self.partition_ids[indx]
+        return self.ddi_datatensor[target_id]
+
+    def __len__(self):
+        return(self.num_samples)
+
+
+def construct_load_dataloaders(dataset_fold, dsettypes, config, wrk_dir):
+    """construct dataloaders for the dataset for one fold
+
+       Args:
+            dataset_fold: dictionary,
+                          example: {'train': <instance of PartitionDataTensor>,
+                                    'validation': <instance of PartitionDataTensor>,
+                                    'test': <instance of PartitionDataTensor>,
+                                    'class_weights': tensor([0.6957, 1.7778])
+                                   }
+            dsettypes: list, ['train', 'validation', 'test']
+            config: dict, {'batch_size': int, 'num_workers': int}
+            wrk_dir: string, folder path
+    """
+
+    # setup data loaders
+    data_loaders = {}
+    epoch_loss_avgbatch = {}
+    epoch_loss_avgsamples = {}
+    flog_out = {}
+    score_dict = {}
+    class_weights = {}
+    for dsettype in dsettypes:
+        if(dsettype == 'train'):
+            shuffle = True
+            class_weights[dsettype] = dataset_fold['class_weights']
+        else:
+            shuffle = False
+            class_weights[dsettype] = None
+        data_loaders[dsettype] = DataLoader(dataset_fold[dsettype],
+                                            batch_size=config['batch_size'],
+                                            shuffle=shuffle,
+                                            num_workers=config['num_workers'])
+
+        epoch_loss_avgbatch[dsettype] = []
+        epoch_loss_avgsamples[dsettype] = []
+        score_dict[dsettype] = ModelScore(0, 0.0, 0.0, 0.0, 0.0, 0.0)  # (best_epoch, auc, aupr, f1, precision, recall)
+        if(wrk_dir):
+            flog_out[dsettype] = os.path.join(wrk_dir, dsettype + ".log")
+        else:
+            flog_out[dsettype] = None
+
+    return (data_loaders, epoch_loss_avgbatch, epoch_loss_avgsamples, score_dict, class_weights, flog_out)
+
+
+def preprocess_features(feat_fpath):
+    X_fea = np.loadtxt(feat_fpath, dtype=float, delimiter=",")
+    r, c = np.triu_indices(len(X_fea), 1)  # take indices above the diagonal (offset by 1)
+    return np.concatenate((X_fea[r], X_fea[c]), axis=1)
+
+
+def preprocess_labels(interaction_fpath):
+    interaction_matrix = np.loadtxt(interaction_fpath, dtype=float, delimiter=",")
+    r, c = np.triu_indices(len(interaction_matrix), 1)  # take indices above the diagonal (offset by 1)
+    return interaction_matrix[r, c]
+
+
+def get_stratified_partitions(ddi_datatensor, num_folds=5, valid_set_portion=0.1, random_state=42):
+    """Generate stratified k-fold splits of drug-pair ids based on the interaction label
+
+       Args:
+            ddi_datatensor: instance of :class:`DDIDataTensor`
+    """
+    skf_trte = StratifiedKFold(n_splits=num_folds, random_state=random_state, shuffle=True)  # split train and test
+    data_partitions = {}
+    X = ddi_datatensor.X_feat
+    y = ddi_datatensor.y
+    fold_num = 0
+    for train_index, test_index in skf_trte.split(X, y):
+
+        data_partitions[fold_num] = {'train': train_index,
+                                     'test': test_index}
+        print("fold_num:", fold_num)
+        print('train data')
+        report_label_distrib(y[train_index])
+        print('test data')
+        report_label_distrib(y[test_index])
+        print()
+        fold_num += 1
+        print("-"*25)
+    return(data_partitions)
+
+
+def report_label_distrib(labels):
+    classes, counts = np.unique(labels, return_counts=True)
+    norm_counts = counts/counts.sum()
+    for i, label in enumerate(classes):
+        print("class:", label, "norm count:", norm_counts[i])
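+
+# A minimal usage sketch of how the helpers above compose (illustrative only;
+# the file names below are hypothetical placeholders, not files shipped with
+# this patch):
+#
+#     X_feat = torch.from_numpy(preprocess_features('similarity_matrix.csv')).float()
+#     y = torch.from_numpy(preprocess_labels('interaction_matrix.csv')).float()
+#     ddi_datatensor = DDIDataTensor(X_feat, y)
+#     data_partitions = get_stratified_partitions(ddi_datatensor, num_folds=5)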
print("class:", label, "norm count:", norm_counts[i]) + + +def validate_partitions(data_partitions, drugpairs_ids, valid_set_portion=0.1, test_set_portion=0.2): + if(not isinstance(drugpairs_ids, set)): + drugpairs_ids = set(drugpairs_ids) + num_pairs = len(drugpairs_ids) + test_set_accum = set([]) + for fold_num in data_partitions: + print('fold_num', fold_num) + tr_ids = data_partitions[fold_num]['train'] + te_ids = data_partitions[fold_num]['test'] + + tr_te = set(tr_ids).intersection(te_ids) + # assert there is no overlap among train and test partition within a fold + assert len(tr_te) == 0 + print('expected test set size:', test_set_portion*num_pairs, '; actual test set size:', len(te_ids)) + print() + assert np.abs(test_set_portion*num_pairs - len(te_ids)) <= 2 + test_set_accum = test_set_accum.union(te_ids) + # verify that assembling test sets from each of the five folds would be equivalent to all drugpair ids + assert len(test_set_accum) == num_pairs + assert test_set_accum == drugpairs_ids + print("passed intersection and overlap test (i.e. train, validation and test sets are not", + "intersecting in each fold and the concatenation of test sets from each fold is", + "equivalent to the whole dataset)") + + +def generate_partition_datatensor(ddi_datatensor, data_partitions): + datatensor_partitions = {} + for fold_num in data_partitions: + datatensor_partitions[fold_num] = {} + for dsettype in data_partitions[fold_num]: + target_ids = data_partitions[fold_num][dsettype] + datatensor_partition = PartitionDataTensor(ddi_datatensor, target_ids, dsettype, fold_num) + datatensor_partitions[fold_num][dsettype] = datatensor_partition + return(datatensor_partitions) + +def build_datatensor_partitions(data_partitions, ddi_datatensor): + datatensor_partitions = generate_partition_datatensor(ddi_datatensor, data_partitions) + compute_class_weights_per_fold_(datatensor_partitions) + return datatensor_partitions + +def compute_class_weights(labels_tensor): + classes, counts = np.unique(labels_tensor, return_counts=True) + # print("classes", classes) + # print("counts", counts) + class_weights = compute_class_weight('balanced', classes, labels_tensor.numpy()) + return class_weights + + +def compute_class_weights_per_fold_(datatensor_partitions): + """computes inverse class weights and updates the passed dictionary + + Args: + datatensor_partitions: dictionary, {fold_num, int: {datasettype, string:{datapartition, instance of + :class:`PartitionDataTensor`}}}} + + Example: + datatensor_partitions + {0: {'train': , + 'validation': , + 'test': + }, .. + } + is updated after computation of class weights to + {0: {'train': , + 'validation': , + 'test': , + 'class_weights': tensor([0.6957, 1.7778]), + }, .. 
+ } + """ + + for fold_num in datatensor_partitions: # looping over the numbered folds + dpartition = datatensor_partitions[fold_num]['train'] + partition_ids = dpartition.partition_ids + labels = dpartition.ddi_datatensor.y[partition_ids] + datatensor_partitions[fold_num]['class_weights'] = torch.from_numpy(compute_class_weights(labels)).float() + +def read_pickles(data_dir, device): + + # Read stored data structures + data_partitions = ReaderWriter.read_data(os.path.join(data_dir, 'data_partitions.pkl')) + # instance of :class:`DDIDataTensor` + ddi_datatensor = ReaderWriter.read_tensor(os.path.join(data_dir, 'ddi_datatensor.torch'), device) + + return data_partitions, ddi_datatensor diff --git a/ddi/model.py b/ddi/model.py new file mode 100644 index 0000000..ed6719b --- /dev/null +++ b/ddi/model.py @@ -0,0 +1,52 @@ +import torch +from torch import nn +import torch.nn.functional as F + +class NDD_Paper(nn.Module): + def __init__(self, D_in=1096, H1=300, H2=400, D_out=1, drop=0.5): + super(NDD_Paper, self).__init__() + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(D_in, H1) # Fully Connected + self.fc2 = nn.Linear(H1, H2) + self.fc3 = nn.Linear(H2, D_out) + self.drop = nn.Dropout(drop) + self._init_weights() + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = self.drop(x) + x = F.relu(self.fc2(x)) + x = self.drop(x) + x = self.fc3(x) + return x + + def _init_weights(self): + for m in self.modules(): + if(isinstance(m, nn.Linear)): + m.weight.data.normal_(0, 0.05) + m.bias.data.uniform_(-1,0) + + +class NDD_Code(nn.Module): + def __init__(self, D_in=1096, H1=400, H2=300, D_out=1, drop=0.5): + super(NDD_Code, self).__init__() + # an affine operation: y = Wx + b + self.fc1 = nn.Linear(D_in, H1) # Fully Connected + self.fc2 = nn.Linear(H1, H2) + self.fc3 = nn.Linear(H2, D_out) + self.drop = nn.Dropout(drop) + self._init_weights() + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = self.drop(x) + x = F.relu(self.fc2(x)) + x = self.drop(x) + x = self.fc3(x) + return x + + def _init_weights(self): + for m in self.modules(): + if(isinstance(m, nn.Linear)): + nn.init.xavier_normal_(m.weight.data) + m.bias.data.uniform_(-1,0) \ No newline at end of file diff --git a/ddi/run_workflow.py b/ddi/run_workflow.py new file mode 100644 index 0000000..67bdbbf --- /dev/null +++ b/ddi/run_workflow.py @@ -0,0 +1,353 @@ + +import os +import itertools +from .utilities import get_device, create_directory, ReaderWriter, perfmetric_report, plot_loss +from .model import NDD_Code +from .dataset import construct_load_dataloaders +import numpy as np +import pandas as pd +import torch +from torch import nn +import torch.multiprocessing as mp + + +class NDDHyperparamConfig: + def __init__(self, fc1_dim, fc2_dim, p_dropout, l2_reg, batch_size, num_epochs): + self.fc1_dim = fc1_dim + self.fc2_dim = fc2_dim + self.p_dropout = p_dropout + self.l2_reg = l2_reg + self.batch_size = batch_size + self.num_epochs = num_epochs + + def __repr__(self): + desc = " fc1_dim:{}\n fc2_dim:{}\n p_dropout:{} \n " \ + "l2_reg:{} \n batch_size:{} \n num_epochs: {}".format(self.fc1_dim, + self.fc2_dim, + self.p_dropout, + self.l2_reg, + self.batch_size, + self.num_epochs) + return desc + + +def generate_models_config(hyperparam_config, similarity_type, fold_num, fdtype): + + + # currently generic_config is shared across all models + # leaving it as placeholder such that custom generic configs could be passed :) + + + ndd_config = {'input_dim':1096, + 'fc1_dim':hyperparam_config.fc1_dim, + 
'fc2_dim':hyperparam_config.fc2_dim,
+                  'pdropout':hyperparam_config.p_dropout,
+                  'to_gpu':True,
+                  }
+    generic_config = {'fdtype':fdtype}
+    dataloader_config = {'batch_size': hyperparam_config.batch_size,
+                         'num_workers': 0}
+    config = {'dataloader_config': dataloader_config,
+              'ndd_config': ndd_config,
+              'generic_config': generic_config
+              }
+
+    options = {'similarity_type': similarity_type,
+               'fold_num': fold_num,
+               'num_epochs': hyperparam_config.num_epochs,
+               'weight_decay': hyperparam_config.l2_reg}
+
+    return config, options
+
+
+def build_config_map(similarity_type):
+    hyperparam_config = NDDHyperparamConfig(400, 300, 0.5, 0, 200, 20)
+    fold_num = -1
+    mconfig, options = generate_models_config(hyperparam_config, similarity_type, fold_num, torch.float32)
+    return mconfig, options
+
+
+def dump_dict_content(dsettype_content_map, dsettypes, desc, wrk_dir):
+    for dsettype in dsettypes:
+        path = os.path.join(wrk_dir, '{}_{}.pkl'.format(desc, dsettype))
+        ReaderWriter.dump_data(dsettype_content_map[dsettype], path)
+
+
+def run_ddi(data_partition, dsettypes, config, options, wrk_dir,
+            state_dict_dir=None, to_gpu=True, gpu_index=0):
+    pid = "{}".format(os.getpid())  # process id description
+    # get data loader config
+    dataloader_config = config['dataloader_config']
+    cld = construct_load_dataloaders(data_partition, dsettypes, dataloader_config, wrk_dir)
+    # dictionaries by dsettypes
+    data_loaders, epoch_loss_avgbatch, epoch_loss_avgsamples, score_dict, class_weights, flog_out = cld
+    # print(class_weights)
+    device = get_device(to_gpu, gpu_index)  # gpu device
+    generic_config = config['generic_config']
+    fdtype = generic_config['fdtype']
+    if('train' in class_weights):
+        class_weights = class_weights['train'][1].type(fdtype).to(device)  # weight of the positive class as fdtype tensor
+    else:
+        class_weights = torch.tensor([1]).type(fdtype).to(device)  # weighting all cases equally
+
+    print("class weights", class_weights)
+    # loss_func = torch.nn.NLLLoss(weight=class_weights, reduction='mean')  # negative log likelihood loss
+    # binary cross entropy
+    loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=class_weights, reduction='mean')
+
+    num_epochs = options.get('num_epochs', 50)
+    fold_num = options.get('fold_num')
+
+    # parse config dict
+    ndd_config = config['ndd_config']
+
+    # ddi model
+    ndd_model = NDD_Code(D_in=ndd_config['input_dim'],
+                         H1=ndd_config['fc1_dim'],
+                         H2=ndd_config['fc2_dim'],
+                         D_out=1,
+                         drop=ndd_config['pdropout'])
+
+    # define optimizer and group parameters
+    models_param = list(ndd_model.parameters())
+    models = [(ndd_model, 'ndd_code')]
+
+    if(state_dict_dir):  # load state dictionary of saved models
+        num_train_epochs = 20
+        for m, m_name in models:  # TODO: update this as it should read the best model achieved on the validation set
+            m.load_state_dict(torch.load(os.path.join(state_dict_dir, '{}_{}.pkl'.format(m_name, num_train_epochs)), map_location=device))
+
+    # update models fdtype and move to device
+    for m, m_name in models:
+        m.type(fdtype).to(device)
+
+    if('train' in data_loaders):
+        weight_decay = options.get('weight_decay', 1e-3)
+        optimizer = torch.optim.Adam(models_param, weight_decay=weight_decay, lr=1e-3)
+        # see the paper "Cyclical Learning Rates for Training Neural Networks" for the parameters' choice
+        # `https://arxiv.org/pdf/1506.01186.pdf`
+        # pytorch version >1.1, scheduler should be called after optimizer
+        # for cyclical lr scheduler, it should be called after each batch update
+        num_iter = len(data_loaders['train'])  # num_train_samples/batch_size
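+        # Sizing note: with step_size_up = 5*num_iter (i.e. 5 epochs of batches)
+        # and mode='triangular', one full LR cycle spans 10 epochs, so the
+        # default 20-epoch runs cover roughly two complete cycles.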
+        c_step_size = int(np.ceil(5*num_iter))  # this should be 2-10 times num_iter
+        base_lr = 3e-4
+        max_lr = 5*base_lr  # 3-5 times base_lr
+        cyc_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr, max_lr, step_size_up=c_step_size,
+                                                          mode='triangular', cycle_momentum=False)
+
+    # if ('validation' in data_loaders):
+    m_state_dict_dir = create_directory(os.path.join(wrk_dir, 'model_statedict'))
+
+    if(num_epochs > 1):
+        fig_dir = create_directory(os.path.join(wrk_dir, 'figures'))
+
+    # dump config dictionaries on disk
+    config_dir = create_directory(os.path.join(wrk_dir, 'config'))
+    ReaderWriter.dump_data(config, os.path.join(config_dir, 'mconfig.pkl'))
+    ReaderWriter.dump_data(options, os.path.join(config_dir, 'exp_options.pkl'))
+    sigmoid = torch.nn.Sigmoid()
+    for epoch in range(num_epochs):
+        # print("-"*35)
+        for dsettype in dsettypes:
+            print("device: {} | similarity_type: {} | fold_num: {} | epoch: {} | dsettype: {} | pid: {}"
+                  "".format(device, options.get('similarity_type'), fold_num, epoch, dsettype, pid))
+            pred_class = []
+            ref_class = []
+            prob_scores = []
+            ddi_ids = []
+            data_loader = data_loaders[dsettype]
+            # total_num_samples = len(data_loader.dataset)
+            epoch_loss = 0.
+            epoch_loss_deavrg = 0.
+
+            if(dsettype == 'train'):  # should be only for train
+                for m, m_name in models:
+                    m.train()
+            else:
+                for m, m_name in models:
+                    m.eval()
+
+            for i_batch, samples_batch in enumerate(data_loader):
+                # print('batch num:', i_batch)
+
+                # zero model grad
+                if(dsettype == 'train'):
+                    optimizer.zero_grad()
+
+                X_batch, y_batch, ids = samples_batch
+
+                X_batch = X_batch.to(device)
+                y_batch = y_batch.reshape(-1, 1)  # TODO: reshape when preprocessing features
+                y_batch = y_batch.to(device)
+                # print('ids', ids.shape, ids.dtype)
+
+                with torch.set_grad_enabled(dsettype == 'train'):
+                    num_samples_perbatch = X_batch.size(0)
+                    # print("number_samples_per_batch", num_samples_perbatch)
+                    y_pred_logit = ndd_model(X_batch)
+                    y_pred_prob = sigmoid(y_pred_logit)
+                    y_pred_clss = torch.zeros(y_pred_prob.shape, device=device, dtype=torch.int32)
+                    y_pred_clss[y_pred_prob > 0.5] = 1
+
+                    # print('y_pred_logit', y_pred_logit.shape, y_pred_logit.dtype)
+                    # print('y_pred_prob', y_pred_prob.shape, y_pred_prob.dtype)
+                    # print('y_pred_class', y_pred_clss.shape, y_pred_clss.dtype)
+                    # print('y_batch', y_batch.shape, y_batch.dtype)
+
+                    if(dsettype == 'test'):
+                        pred_class.extend(y_pred_clss.view(-1).tolist())
+                        ref_class.extend(y_batch.view(-1).tolist())
+                        prob_scores.extend(y_pred_prob.view(-1).tolist())
+                        ddi_ids.extend(ids.tolist())
+
+                    loss = loss_func(y_pred_logit, y_batch)
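+                    # Note: BCEWithLogitsLoss fuses the sigmoid with the binary
+                    # cross entropy for numerical stability, so the raw logits go
+                    # into the loss; `y_pred_prob` above is only used for
+                    # thresholding and the reported probability scores.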
+                    if(dsettype == 'train'):
+                        # print("computing loss")
+                        # backward step (i.e. compute gradients)
+                        loss.backward()
+                        # optimizer step -- update weights
+                        optimizer.step()
+                        # step the scheduler after each batch
+                        cyc_scheduler.step()
+                    epoch_loss += loss.item()
+                    # deaverage the loss to deal with the last batch having unequal size
+                    epoch_loss_deavrg += loss.item() * num_samples_perbatch
+
+                    # torch.cuda.ipc_collect()
+                    # torch.cuda.empty_cache()
+            # end of epoch
+            # print("+"*35)
+            epoch_loss_avgbatch[dsettype].append(epoch_loss/len(data_loader))
+            epoch_loss_avgsamples[dsettype].append(epoch_loss_deavrg/len(data_loader.dataset))
+
+            modelscore = perfmetric_report(pred_class, ref_class, prob_scores, epoch+1, flog_out[dsettype])
+            perf = modelscore.s_auc
+            if(perf > score_dict[dsettype].s_auc):
+                score_dict[dsettype] = modelscore
+                for m, m_name in models:
+                    torch.save(m.state_dict(), os.path.join(m_state_dict_dir, '{}_{}.pkl'.format(m_name, (epoch+1))))
+
+    if(num_epochs > 1):
+        plot_loss(epoch_loss_avgbatch, epoch_loss_avgsamples, fig_dir)
+
+    # dump_scores
+    dump_dict_content(score_dict, list(score_dict.keys()), 'score', wrk_dir)
+    # this will run once
+    if(dsettype == 'test'):
+        # save predictions
+        predictions_df = build_predictions_df(ddi_ids, ref_class, pred_class, prob_scores)
+        predictions_path = os.path.join(wrk_dir, 'predictions.csv')
+        predictions_df.to_csv(predictions_path)
+
+    # return ref_class, pred_class, prob_scores
+
+
+def build_predictions_df(ids, true_class, pred_class, prob_scores):
+    df_dict = {
+        'id': ids,
+        'true_class': true_class,
+        'pred_class': pred_class,
+        'prob_score_class1': prob_scores,
+    }
+    predictions_df = pd.DataFrame(df_dict)
+    predictions_df.set_index('id', inplace=True)
+    return predictions_df
+
+
+def generate_hyperparam_space():
+    fc1_dim = [400]
+    fc2_dim = [300]
+    l2_reg_vals = [0.0]
+    batch_size_vals = [200]
+    dropout_vals = [0.5]
+    num_epochs_vals = [20]
+    hyperparam_space = list(itertools.product(*[fc1_dim, fc2_dim,
+                                                dropout_vals,
+                                                l2_reg_vals,
+                                                batch_size_vals,
+                                                num_epochs_vals]))
+    return hyperparam_space
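+
+# Rough sanity check for the random-search helpers below (illustrative):
+# with prob_interval_truemax=0.05 and prob_estim=0.95, compute_numtrials gives
+# ceil(log(1-0.95)/log(1-0.05)) + 1 = 60 trials, which get_hyperparam_options
+# then caps at the size of the enumerated space (a single configuration above).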
+
+
+def get_hyperparam_options(prob_interval_truemax, prob_estim, random_seed=42):
+    np.random.seed(random_seed)
+    num_trials = compute_numtrials(prob_interval_truemax, prob_estim)
+    hyperparam_space = generate_hyperparam_space()
+    if(num_trials > len(hyperparam_space)):
+        num_trials = len(hyperparam_space)
+    indxs = np.random.choice(len(hyperparam_space), size=num_trials, replace=False)
+    # fc1_dim, fc2_dim, p_dropout, l2_reg, batch_size, num_epochs
+    return [NDDHyperparamConfig(*hyperparam_space[indx]) for indx in indxs]
+
+
+def get_random_simtype_fold_per_hyperparam_exp(similarity_types, random_seed=42):
+    """Get for each similarity type the fold number to use for identifying optimal hyperparams
+    """
+    np.random.seed(random_seed)
+    simtype_fold = {}
+    for sim_type in similarity_types:
+        simtype_fold[sim_type] = np.random.randint(5)
+    return simtype_fold
+
+
+def get_saved_config(config_dir):
+    options = ReaderWriter.read_data(os.path.join(config_dir, 'exp_options.pkl'))
+    mconfig = ReaderWriter.read_data(os.path.join(config_dir, 'mconfig.pkl'))
+    return mconfig, options
+
+
+def get_index_argmax(score_matrix, target_indx):
+    argmax_indx = np.argmax(score_matrix, axis=0)[target_indx]
+    return argmax_indx
+
+
+def train_val_run(datatensor_partitions, config_map, train_val_dir, fold_gpu_map, num_epochs=20):
+    dsettypes = ['train']
+    mconfig, options = config_map
+    options['num_epochs'] = num_epochs  # override number of epochs with the user-specified value
+    similarity_type = options['similarity_type']
+    for fold_num in datatensor_partitions:
+        # update the fold number in options to the current fold
+        options['fold_num'] = fold_num
+        data_partition = datatensor_partitions[fold_num]
+        path = os.path.join(train_val_dir, 'train_val_{}'.format(similarity_type), 'fold_{}'.format(fold_num))
+        wrk_dir = create_directory(path)
+        run_ddi(data_partition, dsettypes, mconfig, options, wrk_dir,
+                state_dict_dir=None, to_gpu=True, gpu_index=fold_gpu_map[fold_num])
+
+
+def test_run(datatensor_partitions, config_map, train_val_dir, test_dir, fold_gpu_map, num_epochs=1):
+    dsettypes = ['test']
+    mconfig, options = config_map
+    options['num_epochs'] = num_epochs  # override number of epochs with the user-specified value
+    similarity_type = options['similarity_type']
+    for fold_num in datatensor_partitions:
+        # update the fold number in options to the current fold
+        options['fold_num'] = fold_num
+        data_partition = datatensor_partitions[fold_num]
+        path = os.path.join(train_val_dir, 'train_val_{}'.format(similarity_type), 'fold_{}'.format(fold_num))
+        if os.path.exists(path):
+            train_dir = create_directory(path)
+            # path of the state_dict saved while training this fold
+            state_dict_pth = os.path.join(train_dir, 'model_statedict')
+            path = os.path.join(test_dir, 'test_{}'.format(similarity_type), 'fold_{}'.format(fold_num))
+            test_wrk_dir = create_directory(path)
+            run_ddi(data_partition, dsettypes, mconfig, options, test_wrk_dir,
+                    state_dict_dir=state_dict_pth, to_gpu=True,
+                    gpu_index=fold_gpu_map[fold_num])
+        else:
+            print('WARNING: train directory not found: {}'.format(path))
+
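For orientation, a hypothetical driver for these two entry points. The partition dict, directory names and similarity type below are illustrative assumptions, not part of this patch (building the actual data tensors is handled in ddi/dataset.py):

```python
from ddi.run_workflow import train_val_run, test_run, get_saved_config

# assumed: fold_num -> data partition with 'train'/'test' loaders (see ddi/dataset.py)
datatensor_partitions = build_datatensor_partitions()  # placeholder, not part of the patch
mconfig, options = get_saved_config('config')          # or construct the dicts directly
options['similarity_type'] = 'sideeffect'
fold_gpu_map = {fold: 0 for fold in datatensor_partitions}  # run every fold on cuda:0

train_val_run(datatensor_partitions, (mconfig, options), 'experiments', fold_gpu_map, num_epochs=20)
test_run(datatensor_partitions, (mconfig, options), 'experiments', 'experiments', fold_gpu_map, num_epochs=1)
```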
diff --git a/ddi/utilities.py b/ddi/utilities.py
new file mode 100644
index 0000000..f7be57a
--- /dev/null
+++ b/ddi/utilities.py
@@ -0,0 +1,330 @@
+import os
+import shutil
+import pickle
+import torch
+import numpy as np
+import pandas as pd
+from sklearn.metrics import classification_report, f1_score, roc_curve, precision_recall_curve, accuracy_score, \
+    recall_score, precision_score, roc_auc_score, auc
+from matplotlib import pyplot as plt
+
+
+class ModelScore:
+    def __init__(self, best_epoch_indx, s_auc, s_aupr, s_f1, s_precision, s_recall):
+        self.best_epoch_indx = best_epoch_indx
+        self.s_auc = s_auc
+        self.s_aupr = s_aupr
+        self.s_f1 = s_f1
+        self.s_precision = s_precision
+        self.s_recall = s_recall
+
+    def __repr__(self):
+        desc = " best_epoch_indx:{}\n auc:{} \n aupr:{} \n f1:{} \n precision:{} \n recall:{} \n" \
+               "".format(self.best_epoch_indx, self.s_auc, self.s_aupr, self.s_f1, self.s_precision, self.s_recall)
+        return desc
+
+
+def get_performance_results(similarity_type, target_dir, num_folds, dsettype):
+    num_metrics = 3  # number of metrics to focus on
+    perf_dict = [{} for i in range(num_metrics)]  # track auc, aupr and f1 measure
+    if dsettype == 'train':
+        prefix = 'train_val'
+    else:
+        prefix = dsettype
+    for fold_num in range(num_folds):
+        fold_dir = os.path.join(target_dir,
+                                '{}_{}'.format(prefix, similarity_type),
+                                'fold_{}'.format(fold_num))
+        score_file = os.path.join(fold_dir, 'score_{}.pkl'.format(dsettype))
+        if os.path.isfile(score_file):
+            mscore = ReaderWriter.read_data(score_file)
+            perf_dict[0]['fold{}'.format(fold_num)] = mscore.s_auc
+            perf_dict[1]['fold{}'.format(fold_num)] = mscore.s_aupr
+            perf_dict[2]['fold{}'.format(fold_num)] = mscore.s_f1
+    perf_df = []
+    for i in range(num_metrics):
+        all_perf = perf_dict[i]
+        all_perf_df = pd.DataFrame(all_perf, index=[similarity_type])
+        median = all_perf_df.median(axis=1)
+        mean = all_perf_df.mean(axis=1)
+        stddev = all_perf_df.std(axis=1)
+        all_perf_df['mean'] = mean
+        all_perf_df['median'] = median
+        all_perf_df['stddev'] = stddev
+        perf_df.append(all_perf_df.sort_values('mean', ascending=False))
+    return perf_df
+
+
+def build_performance_dfs(similarity_types, target_dir, num_folds, dsettype):
+    auc_df = pd.DataFrame()
+    aupr_df = pd.DataFrame()
+    f1_df = pd.DataFrame()
+    for sim_type in similarity_types:
+        s_auc, s_aupr, s_f1 = get_performance_results(sim_type, target_dir, num_folds, dsettype)
+        auc_df = pd.concat([auc_df, s_auc], sort=True)
+        aupr_df = pd.concat([aupr_df, s_aupr], sort=True)
+        f1_df = pd.concat([f1_df, s_f1], sort=True)
+    return auc_df, aupr_df, f1_df
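Downstream, the per-fold score pickles written by `run_ddi` can be aggregated per similarity type; a sketch, assuming a `test_dir` laid out by `test_run` above and two example similarity-type names:

```python
from ddi.utilities import build_performance_dfs

auc_df, aupr_df, f1_df = build_performance_dfs(['sideeffect', 'offsideeffect'],
                                               'test_dir', num_folds=5, dsettype='test')
print(auc_df[['mean', 'median', 'stddev']])  # one row per similarity type
```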
+
+
+class ReaderWriter(object):
+    """class for dumping, reading and logging data"""
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def dump_data(data, file_name, mode="wb"):
+        """dump data by pickling
+           Args:
+               data: data to be pickled
+               file_name: file path where data will be dumped
+               mode: specify writing options i.e. binary or unicode
+        """
+        with open(file_name, mode) as f:
+            pickle.dump(data, f)
+
+    @staticmethod
+    def read_data(file_name, mode="rb"):
+        """read dumped/pickled data
+           Args:
+               file_name: file path where data was dumped
+               mode: specify reading options i.e. binary or unicode
+        """
+        with open(file_name, mode) as f:
+            data = pickle.load(f)
+        return(data)
+
+    @staticmethod
+    def dump_tensor(data, file_name):
+        """dump a tensor using PyTorch's custom serialization. Enables re-loading the tensor on a specific gpu later.
+           Args:
+               data: Tensor
+               file_name: file path where data will be dumped
+        """
+        torch.save(data, file_name)
+
+    @staticmethod
+    def read_tensor(file_name, device):
+        """read a dumped tensor
+           Args:
+               file_name: file path where data was dumped
+               device: the gpu to load the tensor on to
+        """
+        data = torch.load(file_name, map_location=device)
+        return data
+
+    @staticmethod
+    def write_log(line, outfile, mode="a"):
+        """write data to a file
+           Args:
+               line: string representing data to be written out
+               outfile: file path where data will be written/logged
+               mode: specify writing options i.e. append, write
+        """
+        with open(outfile, mode) as f:
+            f.write(line)
+
+    @staticmethod
+    def read_log(file_name, mode="r"):
+        """read logged data from a file, line by line
+           Args:
+               file_name: file path where data was written/logged
+               mode: specify reading options i.e. read
+        """
+        with open(file_name, mode) as f:
+            for line in f:
+                yield line
+
+
+def create_directory(folder_name, directory="current"):
+    """create directory/folder (if it does not exist) and returns the path of the directory
+       Args:
+           folder_name: string representing the name of the folder to be created
+       Keyword Arguments:
+           directory: string representing the directory where to create the folder
+                      if `current` then the folder will be created in the current directory
+    """
+    if directory == "current":
+        path_current_dir = os.path.dirname(__file__)  # __file__ refers to utilities.py
+    else:
+        path_current_dir = directory
+    path_new_dir = os.path.join(path_current_dir, folder_name)
+    if not os.path.exists(path_new_dir):
+        os.makedirs(path_new_dir)
+    return(path_new_dir)
+
+
+def get_device(to_gpu, index=0):
+    is_cuda = torch.cuda.is_available()
+    if(is_cuda and to_gpu):
+        target_device = 'cuda:{}'.format(index)
+    else:
+        target_device = 'cpu'
+    return torch.device(target_device)
+
+
+def report_available_cuda_devices():
+    if(torch.cuda.is_available()):
+        n_gpu = torch.cuda.device_count()
+        print('number of GPUs available:', n_gpu)
+        for i in range(n_gpu):
+            print("cuda:{}, name:{}".format(i, torch.cuda.get_device_name(i)))
+            device = torch.device('cuda', i)
+            get_cuda_device_stats(device)
+            print()
+    else:
+        print("no GPU devices available!!")
+
+
+def get_cuda_device_stats(device):
+    print('total memory available:', torch.cuda.get_device_properties(device).total_memory/(1024**3), 'GB')
+    print('total memory allocated on device:', torch.cuda.memory_allocated(device)/(1024**3), 'GB')
+    print('max memory allocated on device:', torch.cuda.max_memory_allocated(device)/(1024**3), 'GB')
+    print('total memory cached on device:', torch.cuda.memory_cached(device)/(1024**3), 'GB')
+    print('max memory cached on device:', torch.cuda.max_memory_cached(device)/(1024**3), 'GB')
+
+
+def get_interaction_stat(matrix):
+    w, h = matrix.shape
+    totalnum_elements = w*h
+    nonzero_elem = np.count_nonzero(matrix)
+    zero_elem = totalnum_elements - nonzero_elem
+    print('number of rows: {}, cols: {}'.format(w, h))
+    print('total number of elements', totalnum_elements)
+    print('number of nonzero elements', nonzero_elem)
+    print('number of zero elements', zero_elem)
+    print('diagonal elements ', np.diag(matrix))
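A short round-trip through `ReaderWriter` and the device helper; the file and directory names here are arbitrary examples:

```python
import os
import torch
from ddi.utilities import ReaderWriter, create_directory, get_device

device = get_device(to_gpu=True)   # falls back to CPU when CUDA is unavailable
out_dir = create_directory('scratch', directory='.')

# pickle round-trip of an arbitrary python object
ReaderWriter.dump_data({'fold': 0, 'auc': 0.91}, os.path.join(out_dir, 'demo.pkl'))
print(ReaderWriter.read_data(os.path.join(out_dir, 'demo.pkl')))

# tensors go through torch.save/torch.load so they can be re-mapped onto a device on load
ReaderWriter.dump_tensor(torch.zeros(3), os.path.join(out_dir, 'demo.pt'))
print(ReaderWriter.read_tensor(os.path.join(out_dir, 'demo.pt'), device))
```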
"Classification report on all events:" + lsep + report += str(classification_report(ref_target, pred_target)) + lsep + report += "macro f1:" + lsep + macro_f1 = f1_score(ref_target, pred_target, average='macro') + report += str(macro_f1) + lsep + report += "micro f1:" + lsep + micro_f1 = f1_score(ref_target, pred_target, average='micro') + report += str(micro_f1) + lsep + report += "accuracy:" + lsep + accuracy = accuracy_score(ref_target, pred_target) + report += str(accuracy) + lsep + + s_auc = roc_auc_score(ref_target, probscore) + report += "AUC:\n" + str(s_auc) + lsep + precision_scores, recall_scores, __ = precision_recall_curve(ref_target, probscore) + s_aupr = auc(recall_scores, precision_scores) + report += "AUPR:\n" + str(s_aupr) + lsep + s_f1 = f1_score(ref_target, pred_target) + report += "binary f1:\n" + str(s_f1) + lsep + s_recall = recall_score(ref_target, pred_target) + s_precision = precision_score(ref_target, pred_target) + report += "-"*30 + lsep + + modelscore = ModelScore(epoch, s_auc, s_aupr, s_f1, s_precision, s_recall) + ReaderWriter.write_log(report, outlog) + return modelscore + + +def plot_precision_recall_curve(ref_target, prob_poslabel, figname, outdir): + pr, rec, thresholds = precision_recall_curve(ref_target, prob_poslabel) + thresholds[0] = 1 + plt.figure(figsize=(9, 6)) + plt.plot(pr, rec, 'bo', label='Precision vs Recall') + # plt.plot(np.arange(0,len(thresholds)), thresholds, 'r-', label='thresholds') + plt.xlabel('Precision') + plt.ylabel('Recall') + plt.title('Precision vs. recall curve') + plt.legend(loc='best') + plt.savefig(os.path.join(outdir, os.path.join('precisionrecall_curve_{}'.format(figname) + ".pdf"))) + plt.close() + + +def plot_roc_curve(ref_target, prob_poslabel, figname, outdir): + fpr, tpr, thresholds = roc_curve(ref_target, prob_poslabel) + thresholds[0] = 1 + plt.figure(figsize=(9, 6)) + plt.plot(fpr, tpr, 'bo', label='TPR vs FPR') + plt.plot(fpr, thresholds, 'r-', label='thresholds') + plt.xlabel('False positive rate') + plt.ylabel('True positive rate') + plt.title('ROC curve') + plt.legend(loc='best') + plt.savefig(os.path.join(outdir, os.path.join('roc_curve_{}'.format(figname) + ".pdf"))) + plt.close() + + +def plot_loss(epoch_loss_avgbatch, epoch_loss_avgsamples, wrk_dir): + dsettypes = epoch_loss_avgbatch.keys() + for dsettype in dsettypes: + plt.figure(figsize=(9, 6)) + plt.plot(epoch_loss_avgbatch[dsettype], 'r', epoch_loss_avgsamples[dsettype], 'b') + plt.xlabel("number of epochs") + plt.ylabel("negative loglikelihood cost") + plt.legend(['epoch batch average loss', 'epoch training samples average loss']) + plt.savefig(os.path.join(wrk_dir, os.path.join(dsettype + ".pdf"))) + plt.close() + + +def delete_directory(directory): + if(os.path.isdir(directory)): + shutil.rmtree(directory) + + +# code from keras https://github.com/keras-team/keras/blob/master/keras/utils/np_utils.py +def to_categorical(y, num_classes=None, dtype='float32'): + """Converts a class vector (integers) to binary class matrix. + E.g. for use with categorical_crossentropy. + # Arguments + y: class vector to be converted into a matrix + (integers from 0 to num_classes). + num_classes: total number of classes. + dtype: The data type expected by the input, as a string + (`float32`, `float64`, `int32`...) + # Returns + A binary matrix representation of the input. The classes axis + is placed last. 
+ # Example + ```python + # Consider an array of 5 labels out of a set of 3 classes {0, 1, 2}: + > labels + array([0, 2, 1, 2, 0]) + # `to_categorical` converts this into a matrix with as many + # columns as there are classes. The number of rows + # stays the same. + > to_categorical(labels) + array([[ 1., 0., 0.], + [ 0., 0., 1.], + [ 0., 1., 0.], + [ 0., 0., 1.], + [ 1., 0., 0.]], dtype=float32) + ``` + """ + + y = np.array(y, dtype='int') + input_shape = y.shape + if input_shape and input_shape[-1] == 1 and len(input_shape) > 1: + input_shape = tuple(input_shape[:-1]) + y = y.ravel() + if not num_classes: + num_classes = np.max(y) + 1 + n = y.shape[0] + categorical = np.zeros((n, num_classes), dtype=dtype) + categorical[np.arange(n), y] = 1 + output_shape = input_shape + (num_classes,) + categorical = np.reshape(categorical, output_shape) + return categorical + diff --git a/notebooks/.ipynb_checkpoints/02_AA_Skorch_DDI-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/02_AA_Skorch_DDI-checkpoint.ipynb deleted file mode 100644 index 111f7ab..0000000 --- a/notebooks/.ipynb_checkpoints/02_AA_Skorch_DDI-checkpoint.ipynb +++ /dev/null @@ -1,581 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](https://scikit-learn.org/stable/_images/grid_search_workflow.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1230, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 1231, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import pickle\n", - "\n", - "from sklearn.datasets import make_classification\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.model_selection import GridSearchCV\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.model_selection import StratifiedKFold\n", - "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, matthews_corrcoef, precision_recall_curve, auc\n", - "\n", - "from keras.utils import np_utils\n", - "\n", - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import TensorDataset\n", - "from torch.utils.data import Dataset\n", - "from torch.utils.data import DataLoader\n", - "from torch.utils.tensorboard import SummaryWriter\n", - "from torch.optim import SGD\n", - "\n", - "import skorch\n", - "from skorch import NeuralNetClassifier\n", - "from skorch.callbacks import EpochScoring\n", - "from skorch.callbacks import TensorBoard\n", - "from skorch.helper import predefined_split" - ] - }, - { - "cell_type": "code", - "execution_count": 1232, - "metadata": {}, - "outputs": [], - "source": [ - "# import configurations (file paths, etc.)\n", - "import yaml\n", - "try:\n", - " from yaml import CLoader as Loader, CDumper as Dumper\n", - "except ImportError:\n", - " from yaml import Loader, Dumper\n", - " \n", - "configFile = '../cluster/data/medinfmk/ddi/config/config.yml'\n", - "\n", - "with open(configFile, 'r') as ymlfile:\n", - " cfg = yaml.load(ymlfile, Loader=Loader)" - ] - }, - { - "cell_type": "code", - "execution_count": 1233, - "metadata": {}, - "outputs": [], - "source": [ - "pathInput = cfg['filePaths']['dirRaw']\n", - "pathOutput = cfg['filePaths']['dirProcessed']\n", - "# path to store python binary files (pickles)\n", - "# in order not 
to recalculate them every time\n", - "pathPickles = cfg['filePaths']['dirProcessedFiles']['dirPickles']\n", - "pathRuns = cfg['filePaths']['dirProcessedFiles']['dirRuns']\n", - "pathPaperScores = cfg['filePaths']['dirRawFiles']['paper-individual-metrics-scores']\n", - "datasetDirs = cfg['filePaths']['dirRawDatasets']\n", - "DS1_path = str(datasetDirs[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Helper Functions" - ] - }, - { - "cell_type": "code", - "execution_count": 1234, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_data(input_fea, input_lab, seperate=False):\n", - " offside_sim_path = input_fea\n", - " drug_interaction_matrix_path = input_lab\n", - " drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=\",\")\n", - " interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=\",\")\n", - " \n", - " train = []\n", - " label = []\n", - " tmp_fea=[]\n", - " drug_fea_tmp = []\n", - " \n", - " for i in range(0, (interaction.shape[0]-1)):\n", - " for j in range((i+1), interaction.shape[1]):\n", - " label.append(interaction[i,j])\n", - " drug_fea_tmp_1 = list(drug_fea[i])\n", - " drug_fea_tmp_2 = list(drug_fea[j])\n", - " if seperate:\n", - " tmp_fea = (drug_fea_tmp_1,drug_fea_tmp_2)\n", - " else:\n", - " tmp_fea = drug_fea_tmp_1 + drug_fea_tmp_2\n", - " train.append(tmp_fea)\n", - "\n", - " return np.array(train), np.array(label)" - ] - }, - { - "cell_type": "code", - "execution_count": 1235, - "metadata": {}, - "outputs": [], - "source": [ - "def transfer_array_format(data):\n", - " formated_matrix1 = []\n", - " formated_matrix2 = []\n", - " for val in data:\n", - " formated_matrix1.append(val[0])\n", - " formated_matrix2.append(val[1])\n", - " return np.array(formated_matrix1), np.array(formated_matrix2)" - ] - }, - { - "cell_type": "code", - "execution_count": 1236, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess_labels(labels, encoder=None, categorical=True):\n", - " if not encoder:\n", - " encoder = LabelEncoder()\n", - " encoder.fit(labels)\n", - " y = encoder.transform(labels).astype(np.int32)\n", - " if categorical:\n", - " y = np_utils.to_categorical(y)\n", - "# print(y)\n", - " return y, encoder" - ] - }, - { - "cell_type": "code", - "execution_count": 1237, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess_names(labels, encoder=None, categorical=True):\n", - " if not encoder:\n", - " encoder = LabelEncoder()\n", - " encoder.fit(labels)\n", - " if categorical:\n", - " labels = np_utils.to_categorical(labels)\n", - " return labels, encoder" - ] - }, - { - "cell_type": "code", - "execution_count": 1238, - "metadata": {}, - "outputs": [], - "source": [ - "def getStratifiedKFoldSplit(X,y,n_splits):\n", - " skf = StratifiedKFold(n_splits=n_splits)\n", - " return skf.split(X,y)" - ] - }, - { - "cell_type": "code", - "execution_count": 1239, - "metadata": {}, - "outputs": [], - "source": [ - "class NDD(nn.Module):\n", - " def __init__(self, D_in=1096, H1=300, H2=400, D_out=2, drop=0.5):\n", - " super(NDD, self).__init__()\n", - " # an affine operation: y = Wx + b\n", - " self.fc1 = nn.Linear(D_in, H1) # Fully Connected\n", - " self.fc2 = nn.Linear(H1, H2)\n", - " self.fc3 = nn.Linear(H2, D_out)\n", - " self.drop = nn.Dropout(drop)\n", - " self._init_weights()\n", - "\n", - " def forward(self, x):\n", - " x = F.relu(self.fc1(x))\n", - " x = self.drop(x)\n", - " x = F.relu(self.fc2(x))\n", - " x = self.drop(x)\n", - " x = self.fc3(x)\n", - " return x\n", - " \n", - " def 
_init_weights(self):\n", - " for m in self.modules():\n", - " if(isinstance(m, nn.Linear)):\n", - " m.weight.data.normal_(0, 0.05)\n", - " m.bias.data.uniform_(-1,0)" - ] - }, - { - "cell_type": "code", - "execution_count": 1240, - "metadata": {}, - "outputs": [], - "source": [ - "def updateSimilarityDFSingleMetric(df, sim_type, metric, value):\n", - " df.loc[df['Similarity'] == sim_type, metric ] = round(value,3)\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 1241, - "metadata": {}, - "outputs": [], - "source": [ - "def updateSimilarityDF(df, sim_type, AUROC, AUPR, F1, Rec, Prec):\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'AUC', AUROC)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'AUPR', AUPR)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'F-measure', F1)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'Recall', Rec)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'Precision', Prec)\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 1242, - "metadata": {}, - "outputs": [], - "source": [ - "def getNetParamsStr(net, str_hidden_layers_params, net_params_to_print=[\"max_epochs\", \"batch_size\"]):\n", - " net_params = [val for sublist in [[x,net.get_params()[x]] for x in net_params_to_print] for val in sublist]\n", - " net_params_str = '-'.join(map(str, flattened))\n", - " return(net_params_str+str_hidden_layers_params)" - ] - }, - { - "cell_type": "code", - "execution_count": 1243, - "metadata": {}, - "outputs": [], - "source": [ - "def writeReplicatedIndividualScoresCSV(net, df, destination, str_hidden_layers_params):\n", - " filePath = destination + \"replicatedIndividualScores_\" + getNetParamsStr(net, str_hidden_layers_params) + \".csv\"\n", - " df.to_csv(path_or_buf = filePath, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 1244, - "metadata": {}, - "outputs": [], - "source": [ - "def getNDDClassifier(D_in, H1, H2, D_out, drop, Xy_test):\n", - " model = NDD(D_in, H1, H2, D_out, drop)\n", - " \n", - " net = NeuralNetClassifier(\n", - " model,\n", - "# criterion=nn.CrossEntropyLoss,\n", - " criterion=nn.BCEWithLogitsLoss,\n", - " max_epochs=20,\n", - " optimizer=SGD,\n", - " optimizer__lr=0.01,\n", - " optimizer__momentum=0.9, \n", - " optimizer__weight_decay=1e-6, \n", - " optimizer__nesterov=True, \n", - " batch_size=200,\n", - " callbacks=callbacks,\n", - " # Shuffle training data on each epoch\n", - " iterator_train__shuffle=True,\n", - " device=device,\n", - " train_split=predefined_split(Xy_test),\n", - " )\n", - " return net" - ] - }, - { - "cell_type": "code", - "execution_count": 1245, - "metadata": {}, - "outputs": [], - "source": [ - "def avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits):\n", - " AUROC /= kfold_nsplits\n", - " AUPR /= kfold_nsplits\n", - " F1 /= kfold_nsplits\n", - " Rec /= kfold_nsplits\n", - " Prec /= kfold_nsplits\n", - " return AUROC, AUPR, F1, Rec, Prec" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run" - ] - }, - { - "cell_type": "code", - "execution_count": 1246, - "metadata": {}, - "outputs": [], - "source": [ - "df_paperIndividualScores = pd.read_csv(pathPaperScores)\n", - "\n", - "df_replicatedIndividualScores = df_paperIndividualScores.copy()\n", - "\n", - "for col in df_replicatedIndividualScores.columns:\n", - " if col != 'Similarity':\n", - " df_replicatedIndividualScores[col].values[:] = 0" - ] - }, - { - "cell_type": "code", - "execution_count": 1247, - "metadata": {}, - 
"outputs": [], - "source": [ - "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n", - "soft = nn.Softmax(dim=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 1248, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Preparing sideeffect data...\n", - "Running fold0 for sideeffect...\n" - ] - }, - { - "ename": "ValueError", - "evalue": "Classification metrics can't handle a mix of multilabel-indicator and binary targets", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mmodelPicklePath\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpathPickles\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\"model_params/model_params_fold\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"_\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mstr_hidden_layers_params\u001b[0m\u001b[0;34m+\u001b[0m \u001b[0;34m\"_\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0msimilarity\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\".p\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 51\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdo_train_model\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 52\u001b[0;31m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 53\u001b[0m \u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodelPicklePath\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 54\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/classifier.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[0;31m# this is actually a pylint bug:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 148\u001b[0m \u001b[0;31m# https://github.com/PyCQA/pylint/issues/1085\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 149\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mNeuralNetClassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 150\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 151\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mpredict_proba\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/net.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 846\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minitialize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 847\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 848\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 849\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 850\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/net.py\u001b[0m in \u001b[0;36mpartial_fit\u001b[0;34m(self, X, y, classes, **fit_params)\u001b[0m\n\u001b[1;32m 805\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnotify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'on_train_begin'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 806\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 807\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_loop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 808\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mKeyboardInterrupt\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 809\u001b[0m \u001b[0;32mpass\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/net.py\u001b[0m in \u001b[0;36mfit_loop\u001b[0;34m(self, X, y, epochs, **fit_params)\u001b[0m\n\u001b[1;32m 760\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecord\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"valid_batch_count\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalid_batch_count\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 761\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 762\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnotify\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'on_epoch_end'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mon_epoch_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 763\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/net.py\u001b[0m in \u001b[0;36mnotify\u001b[0;34m(self, method_name, **cb_kwargs)\u001b[0m\n\u001b[1;32m 281\u001b[0m 
\u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcb_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0m_\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcb\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcallbacks_\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 283\u001b[0;31m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcb_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 284\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0;31m# pylint: disable=unused-argument\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/callbacks/scoring.py\u001b[0m in \u001b[0;36mon_epoch_end\u001b[0;34m(self, net, dataset_train, dataset_valid, **kwargs)\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mcache_net_infer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muse_caching\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mcached_net\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 412\u001b[0;31m \u001b[0mcurrent_score\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_scoring\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcached_net\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 413\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_record_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcurrent_score\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/skorch/callbacks/scoring.py\u001b[0m in \u001b[0;36m_scoring\u001b[0;34m(self, net, X_test, y_test)\u001b[0m\n\u001b[1;32m 119\u001b[0m instead of running inference again, if available.\"\"\"\n\u001b[1;32m 120\u001b[0m \u001b[0mscorer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_scoring\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscoring_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mscorer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnet\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_is_best_score\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcurrent_score\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[1;32m 167\u001b[0m stacklevel=2)\n\u001b[1;32m 168\u001b[0m return self._score(partial(_cached_call, None), estimator, X, y_true,\n\u001b[0;32m--> 169\u001b[0;31m sample_weight=sample_weight)\n\u001b[0m\u001b[1;32m 170\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_factory_args\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_scorer.py\u001b[0m in \u001b[0;36m_score\u001b[0;34m(self, method_caller, estimator, X, y_true, sample_weight)\u001b[0m\n\u001b[1;32m 210\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m return self._sign * self._score_func(y_true, y_pred,\n\u001b[0;32m--> 212\u001b[0;31m **self._kwargs)\n\u001b[0m\u001b[1;32m 213\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py\u001b[0m in \u001b[0;36maccuracy_score\u001b[0;34m(y_true, y_pred, normalize, sample_weight)\u001b[0m\n\u001b[1;32m 183\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 184\u001b[0m \u001b[0;31m# Compute accuracy for each possible representation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 185\u001b[0;31m \u001b[0my_type\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_check_targets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 186\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_true\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msample_weight\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 187\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0my_type\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'multilabel'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py\u001b[0m in \u001b[0;36m_check_targets\u001b[0;34m(y_true, y_pred)\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_type\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m raise ValueError(\"Classification metrics can't handle a mix of {0} \"\n\u001b[0;32m---> 90\u001b[0;31m \"and {1} targets\".format(type_true, 
type_pred))\n\u001b[0m\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 92\u001b[0m \u001b[0;31m# We can't have more than one value on y_type => The set is no more needed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Classification metrics can't handle a mix of multilabel-indicator and binary targets" - ] - } - ], - "source": [ - "do_prepare_data = True\n", - "do_train_model = True\n", - "kfold_nsplits = 5\n", - "# similaritiesToRun = df_paperIndividualScores['Similarity']\n", - "similaritiesToRun = [\"sideeffect\"]\n", - "\n", - "for similarity in similaritiesToRun:\n", - " input_fea = pathInput+DS1_path+\"/\" + similarity + \"_Jacarrd_sim.csv\"\n", - " input_lab = pathInput+DS1_path+\"/drug_drug_matrix.csv\"\n", - " dataPicklePath = pathPickles+\"data_X_y_\" + similarity + \"_Jaccard.p\"\n", - "\n", - " # Define model\n", - " D_in, H1, H2, D_out, drop = X.shape[1], 300, 400, 2, 0.5\n", - " str_hidden_layers_params = \"-H1-\" + str(H1) + \"-H2-\" + str(H2)\n", - " callbacks = []\n", - " \n", - " # Prepare data if not available\n", - " if do_prepare_data:\n", - " print(\"Preparing \" + similarity + \" data...\")\n", - " X,y = prepare_data(input_fea, input_lab, seperate = False)\n", - "\n", - " with open(dataPicklePath, 'wb') as f:\n", - " pickle.dump([X, y], f)\n", - "\n", - " # Load X,y and split in to train, test\n", - " with open(dataPicklePath, 'rb') as f:\n", - " X, y = pickle.load(f)\n", - " \n", - " X = X.astype(np.float32)\n", - " y = y.astype(np.int64) \n", - " \n", - " y_cat = np_utils.to_categorical(y)\n", - " \n", - " AUROC, AUPR, F1, Rec, Prec = 0,0,0,0,0\n", - " kFoldSplit = getStratifiedKFoldSplit(X,y,n_splits=kfold_nsplits)\n", - " for i, indices in enumerate(kFoldSplit):\n", - " print(\"Running fold\" + str(i) + \" for \" + similarity +\"...\")\n", - " \n", - " train_index = indices[0]\n", - " test_index = indices[1]\n", - " X_train, X_test = X[train_index], X[test_index]\n", - "# y_train, y_test = y[train_index], y[test_index]\n", - " y_train, y_test = y_cat[train_index], y_cat[test_index]\n", - " \n", - " # Create Network Classifier\n", - " Xy_test = skorch.dataset.Dataset(X_test, y_test)\n", - " net = getNDDClassifier(D_in, H1, H2, D_out, drop, Xy_test)\n", - " \n", - " # Fit and save OR load model\n", - " modelPicklePath = pathPickles+\"model_params/model_params_fold\" + str(i) + \"_\" + str_hidden_layers_params+ \"_\" + similarity + \".p\"\n", - " if do_train_model:\n", - " net.fit(X_train, y_train)\n", - " net.save_params(f_params=modelPicklePath)\n", - " else:\n", - " net.initialize() # This is important!\n", - " net.load_params(f_params=modelPicklePath)\n", - "\n", - " # Make predictions\n", - " y_pred = net.predict(X_test)\n", - " lr_probs = soft(net.forward(X_test))[:,1]\n", - " lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)\n", - "\n", - " AUROC += roc_auc_score(y_test, y_pred)\n", - " AUPR += auc(lr_recall, lr_precision)\n", - " F1 += f1_score(y_test, y_pred)\n", - " Rec += recall_score(y_test, y_pred)\n", - " Prec += precision_score(y_test, y_pred)\n", - " \n", - " print(i, similarity, AUROC, AUPR, F1, Rec, Prec)\n", - " \n", - " \n", - " AUROC, AUPR, F1, Rec, Prec = avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits)\n", - " print(similarity, AUROC, AUPR, F1, Rec, Prec)\n", - " \n", - " # Fill replicated metrics\n", - " updateSimilarityDF(df_replicatedIndividualScores, similarity, AUROC, AUPR, F1, Rec, Prec)\n", - " \n", - "# Write CSV\n", - 
"writeReplicatedIndividualScoresCSV(net, df_replicatedIndividualScores, pathRuns, str_hidden_layers_params)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Compare to Paper" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "print(df_paperIndividualScores)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(df_replicatedIndividualScores)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "diff_metrics = ['AUC', 'AUPR', 'F-measure', 'Recall', 'Precision']\n", - "df_diff = df_paperIndividualScores[diff_metrics] - df_replicatedIndividualScores[diff_metrics]\n", - "df_diff_abs = df_diff.abs()\n", - "df_diff_percent = (df_diff_abs / df_paperIndividualScores[diff_metrics]) * 100" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df_diff" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from seaborn import heatmap\n", - "heatmap(df_diff, yticklabels=df_paperIndividualScores[\"Similarity\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "heatmap(df_diff_abs, yticklabels=df_paperIndividualScores[\"Similarity\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "heatmap(df_diff_percent, yticklabels=df_paperIndividualScores[\"Similarity\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import mean_squared_error\n", - "mean_squared_error(df_paperIndividualScores[diff_metrics],\n", - " df_replicatedIndividualScores[diff_metrics])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/notebooks/02_AA_Skorch_DDI.ipynb b/notebooks/02_AA_Skorch_DDI.ipynb deleted file mode 100644 index 111f7ab..0000000 --- a/notebooks/02_AA_Skorch_DDI.ipynb +++ /dev/null @@ -1,581 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "![](https://scikit-learn.org/stable/_images/grid_search_workflow.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1230, - "metadata": {}, - "outputs": [], - "source": [ - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 1231, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "import pickle\n", - "\n", - "from sklearn.datasets import make_classification\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import LabelEncoder\n", - "from sklearn.model_selection import GridSearchCV\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.model_selection import 
StratifiedKFold\n", - "from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, matthews_corrcoef, precision_recall_curve, auc\n", - "\n", - "from keras.utils import np_utils\n", - "\n", - "import torch\n", - "from torch import nn\n", - "import torch.nn.functional as F\n", - "from torch.utils.data import TensorDataset\n", - "from torch.utils.data import Dataset\n", - "from torch.utils.data import DataLoader\n", - "from torch.utils.tensorboard import SummaryWriter\n", - "from torch.optim import SGD\n", - "\n", - "import skorch\n", - "from skorch import NeuralNetClassifier\n", - "from skorch.callbacks import EpochScoring\n", - "from skorch.callbacks import TensorBoard\n", - "from skorch.helper import predefined_split" - ] - }, - { - "cell_type": "code", - "execution_count": 1232, - "metadata": {}, - "outputs": [], - "source": [ - "# import configurations (file paths, etc.)\n", - "import yaml\n", - "try:\n", - " from yaml import CLoader as Loader, CDumper as Dumper\n", - "except ImportError:\n", - " from yaml import Loader, Dumper\n", - " \n", - "configFile = '../cluster/data/medinfmk/ddi/config/config.yml'\n", - "\n", - "with open(configFile, 'r') as ymlfile:\n", - " cfg = yaml.load(ymlfile, Loader=Loader)" - ] - }, - { - "cell_type": "code", - "execution_count": 1233, - "metadata": {}, - "outputs": [], - "source": [ - "pathInput = cfg['filePaths']['dirRaw']\n", - "pathOutput = cfg['filePaths']['dirProcessed']\n", - "# path to store python binary files (pickles)\n", - "# in order not to recalculate them every time\n", - "pathPickles = cfg['filePaths']['dirProcessedFiles']['dirPickles']\n", - "pathRuns = cfg['filePaths']['dirProcessedFiles']['dirRuns']\n", - "pathPaperScores = cfg['filePaths']['dirRawFiles']['paper-individual-metrics-scores']\n", - "datasetDirs = cfg['filePaths']['dirRawDatasets']\n", - "DS1_path = str(datasetDirs[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Helper Functions" - ] - }, - { - "cell_type": "code", - "execution_count": 1234, - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_data(input_fea, input_lab, seperate=False):\n", - " offside_sim_path = input_fea\n", - " drug_interaction_matrix_path = input_lab\n", - " drug_fea = np.loadtxt(offside_sim_path,dtype=float,delimiter=\",\")\n", - " interaction = np.loadtxt(drug_interaction_matrix_path,dtype=int,delimiter=\",\")\n", - " \n", - " train = []\n", - " label = []\n", - " tmp_fea=[]\n", - " drug_fea_tmp = []\n", - " \n", - " for i in range(0, (interaction.shape[0]-1)):\n", - " for j in range((i+1), interaction.shape[1]):\n", - " label.append(interaction[i,j])\n", - " drug_fea_tmp_1 = list(drug_fea[i])\n", - " drug_fea_tmp_2 = list(drug_fea[j])\n", - " if seperate:\n", - " tmp_fea = (drug_fea_tmp_1,drug_fea_tmp_2)\n", - " else:\n", - " tmp_fea = drug_fea_tmp_1 + drug_fea_tmp_2\n", - " train.append(tmp_fea)\n", - "\n", - " return np.array(train), np.array(label)" - ] - }, - { - "cell_type": "code", - "execution_count": 1235, - "metadata": {}, - "outputs": [], - "source": [ - "def transfer_array_format(data):\n", - " formated_matrix1 = []\n", - " formated_matrix2 = []\n", - " for val in data:\n", - " formated_matrix1.append(val[0])\n", - " formated_matrix2.append(val[1])\n", - " return np.array(formated_matrix1), np.array(formated_matrix2)" - ] - }, - { - "cell_type": "code", - "execution_count": 1236, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess_labels(labels, encoder=None, categorical=True):\n", - " if 
not encoder:\n", - " encoder = LabelEncoder()\n", - " encoder.fit(labels)\n", - " y = encoder.transform(labels).astype(np.int32)\n", - " if categorical:\n", - " y = np_utils.to_categorical(y)\n", - "# print(y)\n", - " return y, encoder" - ] - }, - { - "cell_type": "code", - "execution_count": 1237, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess_names(labels, encoder=None, categorical=True):\n", - " if not encoder:\n", - " encoder = LabelEncoder()\n", - " encoder.fit(labels)\n", - " if categorical:\n", - " labels = np_utils.to_categorical(labels)\n", - " return labels, encoder" - ] - }, - { - "cell_type": "code", - "execution_count": 1238, - "metadata": {}, - "outputs": [], - "source": [ - "def getStratifiedKFoldSplit(X,y,n_splits):\n", - " skf = StratifiedKFold(n_splits=n_splits)\n", - " return skf.split(X,y)" - ] - }, - { - "cell_type": "code", - "execution_count": 1239, - "metadata": {}, - "outputs": [], - "source": [ - "class NDD(nn.Module):\n", - " def __init__(self, D_in=1096, H1=300, H2=400, D_out=2, drop=0.5):\n", - " super(NDD, self).__init__()\n", - " # an affine operation: y = Wx + b\n", - " self.fc1 = nn.Linear(D_in, H1) # Fully Connected\n", - " self.fc2 = nn.Linear(H1, H2)\n", - " self.fc3 = nn.Linear(H2, D_out)\n", - " self.drop = nn.Dropout(drop)\n", - " self._init_weights()\n", - "\n", - " def forward(self, x):\n", - " x = F.relu(self.fc1(x))\n", - " x = self.drop(x)\n", - " x = F.relu(self.fc2(x))\n", - " x = self.drop(x)\n", - " x = self.fc3(x)\n", - " return x\n", - " \n", - " def _init_weights(self):\n", - " for m in self.modules():\n", - " if(isinstance(m, nn.Linear)):\n", - " m.weight.data.normal_(0, 0.05)\n", - " m.bias.data.uniform_(-1,0)" - ] - }, - { - "cell_type": "code", - "execution_count": 1240, - "metadata": {}, - "outputs": [], - "source": [ - "def updateSimilarityDFSingleMetric(df, sim_type, metric, value):\n", - " df.loc[df['Similarity'] == sim_type, metric ] = round(value,3)\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 1241, - "metadata": {}, - "outputs": [], - "source": [ - "def updateSimilarityDF(df, sim_type, AUROC, AUPR, F1, Rec, Prec):\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'AUC', AUROC)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'AUPR', AUPR)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'F-measure', F1)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'Recall', Rec)\n", - " df = updateSimilarityDFSingleMetric(df, sim_type, 'Precision', Prec)\n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 1242, - "metadata": {}, - "outputs": [], - "source": [ - "def getNetParamsStr(net, str_hidden_layers_params, net_params_to_print=[\"max_epochs\", \"batch_size\"]):\n", - " net_params = [val for sublist in [[x,net.get_params()[x]] for x in net_params_to_print] for val in sublist]\n", - " net_params_str = '-'.join(map(str, flattened))\n", - " return(net_params_str+str_hidden_layers_params)" - ] - }, - { - "cell_type": "code", - "execution_count": 1243, - "metadata": {}, - "outputs": [], - "source": [ - "def writeReplicatedIndividualScoresCSV(net, df, destination, str_hidden_layers_params):\n", - " filePath = destination + \"replicatedIndividualScores_\" + getNetParamsStr(net, str_hidden_layers_params) + \".csv\"\n", - " df.to_csv(path_or_buf = filePath, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 1244, - "metadata": {}, - "outputs": [], - "source": [ - "def getNDDClassifier(D_in, H1, H2, D_out, drop, 
Xy_test):\n",
-    "    model = NDD(D_in, H1, H2, D_out, drop)\n",
-    "    \n",
-    "    net = NeuralNetClassifier(\n",
-    "        model,\n",
-    "#         criterion=nn.CrossEntropyLoss,\n",
-    "        criterion=nn.BCEWithLogitsLoss,\n",
-    "        max_epochs=20,\n",
-    "        optimizer=SGD,\n",
-    "        optimizer__lr=0.01,\n",
-    "        optimizer__momentum=0.9,\n",
-    "        optimizer__weight_decay=1e-6,\n",
-    "        optimizer__nesterov=True,\n",
-    "        batch_size=200,\n",
-    "        callbacks=callbacks,\n",
-    "        # Shuffle training data on each epoch\n",
-    "        iterator_train__shuffle=True,\n",
-    "        device=device,\n",
-    "        train_split=predefined_split(Xy_test),\n",
-    "    )\n",
-    "    return net"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1245,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits):\n",
-    "    AUROC /= kfold_nsplits\n",
-    "    AUPR /= kfold_nsplits\n",
-    "    F1 /= kfold_nsplits\n",
-    "    Rec /= kfold_nsplits\n",
-    "    Prec /= kfold_nsplits\n",
-    "    return AUROC, AUPR, F1, Rec, Prec"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Run"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1246,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_paperIndividualScores = pd.read_csv(pathPaperScores)\n",
-    "\n",
-    "df_replicatedIndividualScores = df_paperIndividualScores.copy()\n",
-    "\n",
-    "for col in df_replicatedIndividualScores.columns:\n",
-    "    if col != 'Similarity':\n",
-    "        df_replicatedIndividualScores[col].values[:] = 0"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1247,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
-    "soft = nn.Softmax(dim=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1248,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Preparing sideeffect data...\n",
-      "Running fold0 for sideeffect...\n"
-     ]
-    },
-    {
-     "ename": "ValueError",
-     "evalue": "Classification metrics can't handle a mix of multilabel-indicator and binary targets",
-     "output_type": "error",
-     "traceback": [
-      "ValueError                                Traceback (most recent call last)",
-      "<ipython-input> in <module>",
-      "     51     if do_train_model:",
-      "---> 52         net.fit(X_train, y_train)",
-      "     53         net.save_params(f_params=modelPicklePath)",
-      "~/anaconda3/lib/python3.7/site-packages/skorch/classifier.py in fit(self, X, y, **fit_params)",
-      "--> 149         return super(NeuralNetClassifier, self).fit(X, y, **fit_params)",
-      "~/anaconda3/lib/python3.7/site-packages/skorch/net.py in fit_loop(self, X, y, epochs, **fit_params)",
-      "--> 762         self.notify('on_epoch_end', **on_epoch_kwargs)",
-      "~/anaconda3/lib/python3.7/site-packages/skorch/callbacks/scoring.py in on_epoch_end(self, net, dataset_train, dataset_valid, **kwargs)",
-      "--> 412             current_score = self._scoring(cached_net, X_test, y_test)",
-      "~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py in accuracy_score(y_true, y_pred, normalize, sample_weight)",
-      "--> 185     y_type, y_true, y_pred = _check_targets(y_true, y_pred)",
-      "ValueError: Classification metrics can't handle a mix of multilabel-indicator and binary targets"
-     ]
-    }
-   ],
-   "source": [
-    "do_prepare_data = True\n",
-    "do_train_model = True\n",
-    "kfold_nsplits = 5\n",
-    "# similaritiesToRun = df_paperIndividualScores['Similarity']\n",
-    "similaritiesToRun = [\"sideeffect\"]\n",
-    "\n",
-    "for similarity in similaritiesToRun:\n",
-    "    input_fea = pathInput+DS1_path+\"/\" + similarity + \"_Jacarrd_sim.csv\"\n",
-    "    input_lab = pathInput+DS1_path+\"/drug_drug_matrix.csv\"\n",
-    "    dataPicklePath = pathPickles+\"data_X_y_\" + similarity + \"_Jaccard.p\"\n",
-    "\n",
-    "    # Define model\n",
-    "    D_in, H1, H2, D_out, drop = X.shape[1], 300, 400, 2, 0.5\n",
-    "    str_hidden_layers_params = \"-H1-\" + str(H1) + \"-H2-\" + str(H2)\n",
-    "    callbacks = []\n",
-    "    \n",
-    "    # Prepare data if not available\n",
-    "    if do_prepare_data:\n",
-    "        print(\"Preparing \" + similarity + \" data...\")\n",
-    "        X,y = prepare_data(input_fea, input_lab, seperate = False)\n",
-    "\n",
-    "        with open(dataPicklePath, 'wb') as f:\n",
-    "            pickle.dump([X, y], f)\n",
-    "\n",
-    "    # Load X,y and split in to train, test\n",
-    "    with open(dataPicklePath, 'rb') as f:\n",
-    "        X, y = pickle.load(f)\n",
-    "    \n",
-    "    X = X.astype(np.float32)\n",
-    "    y = y.astype(np.int64)\n",
-    "    \n",
-    "    y_cat = np_utils.to_categorical(y)\n",
-    "    \n",
-    "    AUROC, AUPR, F1, Rec, Prec = 0,0,0,0,0\n",
-    "    kFoldSplit = getStratifiedKFoldSplit(X,y,n_splits=kfold_nsplits)\n",
-    "    for i, indices in enumerate(kFoldSplit):\n",
-    "        print(\"Running fold\" + str(i) + \" for \" + similarity +\"...\")\n",
-    "        \n",
-    "        train_index = indices[0]\n",
-    "        test_index = indices[1]\n",
-    "        X_train, X_test = X[train_index], X[test_index]\n",
-    "#         y_train, y_test = y[train_index], y[test_index]\n",
-    "        y_train, y_test = y_cat[train_index], y_cat[test_index]\n",
-    "        \n",
-    "        # Create Network Classifier\n",
-    "        Xy_test = skorch.dataset.Dataset(X_test, y_test)\n",
-    "        net = getNDDClassifier(D_in, H1, H2, D_out, drop, Xy_test)\n",
-    "        \n",
-    "        # Fit and save OR load model\n",
-    "        modelPicklePath = pathPickles+\"model_params/model_params_fold\" + str(i) + \"_\" + str_hidden_layers_params+ \"_\" + similarity + \".p\"\n",
-    "        if do_train_model:\n",
-    "            net.fit(X_train, y_train)\n",
-    "            net.save_params(f_params=modelPicklePath)\n",
-    "        else:\n",
-    "            net.initialize()  # This is important!\n",
-    "            net.load_params(f_params=modelPicklePath)\n",
-    "\n",
-    "        # Make predictions\n",
-    "        y_pred = net.predict(X_test)\n",
-    "        lr_probs = soft(net.forward(X_test))[:,1]\n",
-    "        lr_precision, lr_recall, _ = precision_recall_curve(y_test, lr_probs)\n",
-    "\n",
-    "        AUROC += roc_auc_score(y_test, y_pred)\n",
-    "        AUPR += auc(lr_recall, lr_precision)\n",
-    "        F1 += f1_score(y_test, y_pred)\n",
-    "        Rec += recall_score(y_test, y_pred)\n",
-    "        Prec += precision_score(y_test, y_pred)\n",
-    "        \n",
-    "        print(i, similarity, AUROC, AUPR, F1, Rec, Prec)\n",
-    "    \n",
-    "    \n",
-    "    AUROC, AUPR, F1, Rec, Prec = avgMetrics(AUROC, AUPR, F1, Rec, Prec, kfold_nsplits)\n",
-    "    print(similarity, AUROC, AUPR, F1, Rec, Prec)\n",
-    "    \n",
-    "    # Fill replicated metrics\n",
-    "    updateSimilarityDF(df_replicatedIndividualScores, similarity, AUROC, AUPR, F1, Rec, Prec)\n",
-    "    \n",
-    "# Write CSV\n",
-    "writeReplicatedIndividualScoresCSV(net, df_replicatedIndividualScores, pathRuns, str_hidden_layers_params)"
-   ]
-  },
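The error recorded above comes from skorch's epoch-end scoring, not from the loss itself: np_utils.to_categorical turns y into a one-hot (multilabel-indicator) matrix, so the predefined_split(Xy_test) validation set stores one-hot targets, while net.predict returns a 1-D vector of class indices (binary). The default valid_acc accuracy callback of NeuralNetClassifier therefore raises at the end of the first epoch, and the roc_auc_score/f1_score/recall_score/precision_score calls further down would hit the same mismatch with one-hot y_test. A minimal sketch of one workaround (my assumption, not part of this patch): keep the one-hot targets that BCEWithLogitsLoss expects, but score accuracy through a scorer that collapses them first. EpochScoring and make_scorer are existing skorch/sklearn APIs, and skorch should use a user callback in place of a default one that shares its name; the simpler alternative is to drop to_categorical and train on integer labels with the commented-out nn.CrossEntropyLoss.

    import numpy as np
    from sklearn.metrics import accuracy_score, make_scorer
    from skorch.callbacks import EpochScoring

    def onehot_accuracy(y_true, y_pred):
        # y_true arrives one-hot from the predefined validation split;
        # collapse it to class indices to match net.predict()'s 1-D output
        return accuracy_score(np.argmax(y_true, axis=1), y_pred)

    # reuse the default callback's name so this scorer takes its place
    callbacks = [EpochScoring(make_scorer(onehot_accuracy),
                              lower_is_better=False, name='valid_acc')]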
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Compare to Paper"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "print(df_paperIndividualScores)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(df_replicatedIndividualScores)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [],
-   "source": [
-    "diff_metrics = ['AUC', 'AUPR', 'F-measure', 'Recall', 'Precision']\n",
-    "df_diff = df_paperIndividualScores[diff_metrics] - df_replicatedIndividualScores[diff_metrics]\n",
-    "df_diff_abs = df_diff.abs()\n",
-    "df_diff_percent = (df_diff_abs / df_paperIndividualScores[diff_metrics]) * 100"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df_diff"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from seaborn import heatmap\n",
-    "heatmap(df_diff, yticklabels=df_paperIndividualScores[\"Similarity\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "heatmap(df_diff_abs, yticklabels=df_paperIndividualScores[\"Similarity\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "heatmap(df_diff_percent, yticklabels=df_paperIndividualScores[\"Similarity\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.metrics import mean_squared_error\n",
-    "mean_squared_error(df_paperIndividualScores[diff_metrics],\n",
-    "                   df_replicatedIndividualScores[diff_metrics])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
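The notebook's closing comparison collapses the paper-vs-replication gap into a single scalar. sklearn's mean_squared_error also accepts multioutput='raw_values', which keeps one error per metric column and makes it easier to see which metric drifts most. A minimal sketch with hypothetical score values (only the column names are taken from the notebook):

    import pandas as pd
    from sklearn.metrics import mean_squared_error

    metrics = ['AUC', 'AUPR', 'F-measure', 'Recall', 'Precision']
    paper = pd.DataFrame([[0.95, 0.92, 0.80, 0.78, 0.83]], columns=metrics)
    replicated = pd.DataFrame([[0.93, 0.90, 0.77, 0.75, 0.81]], columns=metrics)

    # one MSE per metric instead of a single aggregate
    per_metric_mse = mean_squared_error(paper, replicated, multioutput='raw_values')
    print(dict(zip(metrics, per_metric_mse)))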
"mean_squared_error(df_paperIndividualScores[diff_metrics],\n", - " df_replicatedIndividualScores[diff_metrics])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/req.txt b/req.txt index b3eed48..475bd1a 100644 --- a/req.txt +++ b/req.txt @@ -173,7 +173,7 @@ scikit-image==0.14.2 scikit-learn==0.20.3 scipy==1.2.1 seaborn==0.9.0 -SecretStorage==3.1.1 +secretstorage==3.1.1 Send2Trash==1.5.0 simplegeneric==0.8.1 singledispatch==3.4.0.3 diff --git a/req_conda.txt b/req_conda.txt new file mode 100644 index 0000000..ee6aa79 --- /dev/null +++ b/req_conda.txt @@ -0,0 +1,197 @@ +absl-py==0.8.1 +alabaster==0.7.12 +anaconda-client==1.7.2 +anaconda-navigator==1.9.7 +anaconda-project==0.8.2 +asn1crypto==0.24.0 +astor==0.8.0 +astroid==2.2.5 +astropy==3.1.2 +atomicwrites==1.3.0 +attrs==19.1.0 +Babel==2.6.0 +backcall==0.1.0 +backports.os==0.1.1 +beautifulsoup4==4.7.1 +biopython==1.73 +bitarray==0.8.3 +bkcharts==0.2 +bleach==3.1.0 +bokeh==1.0.4 +boto==2.49.0 +Bottleneck==1.2.1 +certifi==2019.3.9 +cffi==1.12.2 +chardet==3.0.4 +Click==7.0 +cloudpickle==0.8.0 +clyent==1.2.2 +colorama==0.4.1 +conda==4.6.14 +conda-build==3.17.8 +conda-verify==3.1.1 +contextlib2==0.5.5 +cryptography==2.6.1 +cycler==0.10.0 +Cython==0.29.6 +cytoolz==0.9.0.1 +dask==1.1.4 +deap==1.3.0 +decorator==4.4.0 +defusedxml==0.5.0 +distributed==1.26.0 +docutils==0.14 +entrypoints==0.3 +et-xmlfile==1.0.1 +fastcache==1.0.2 +filelock==3.0.10 +Flask==1.0.2 +future==0.17.1 +gast==0.2.2 +gevent==1.4.0 +glob2==0.6 +gmpy2==2.0.8 +google-pasta==0.1.7 +greenlet==0.4.15 +h5py==2.9.0 +heapdict==1.0.0 +html5lib==1.0.1 +idna==2.8 +imageio==2.5.0 +imagesize==1.1.0 +ipykernel==5.1.0 +ipython==7.4.0 +ipywidgets==7.4.2 +isort==4.3.16 +itsdangerous==1.1.0 +jdcal==1.4 +jedi==0.13.3 +jeepney==0.4 +Jinja2==2.10 +jsonschema==3.0.1 +jupyter==1.0.0 +jupyterlab==0.35.4 +Keras==2.3.1 +Keras-Applications==1.0.8 +Keras-Preprocessing==1.1.0 +keyring==18.0.0 +kiwisolver==1.0.1 +lazy-object-proxy==1.3.1 +llvmlite==0.28.0 +locket==0.2.0 +lxml==4.3.2 +Markdown==3.1.1 +MarkupSafe==1.1.1 +matplotlib==3.0.3 +mccabe==0.6.1 +mistune==0.8.4 +more-itertools==6.0.0 +mpmath==1.1.0 +multipledispatch==0.6.0 +navigator-updater==0.2.1 +nbconvert==5.4.1 +nbformat==4.4.0 +networkx==2.2 +nltk==3.4 +nose==1.3.7 +notebook==5.7.8 +numba==0.43.1 +numexpr==2.6.9 +numpy==1.16.2 +numpydoc==0.8.0 +olefile==0.46 +openpyxl==2.6.1 +packaging==19.0 +pandas==0.24.2 +pandocfilters==1.4.2 +parso==0.3.4 +partd==0.3.10 +path.py==11.5.0 +pathlib2==2.3.3 +patsy==0.5.1 +pep8==1.7.1 +pexpect==4.6.0 +pickleshare==0.7.5 +Pillow==5.4.1 +pkginfo==1.5.0.1 +pluggy==0.9.0 +ply==3.11 +protobuf==3.10.0 +psutil==5.6.1 +ptyprocess==0.6.0 +py==1.8.0 +pycodestyle==2.5.0 +pycosat==0.6.3 +pycparser==2.19 +pycrypto==2.6.1 +pycurl==7.43.0.2 +pyflakes==2.1.1 +Pygments==2.3.1 +pylint==2.3.1 +pyodbc==4.0.26 +pyOpenSSL==19.0.0 +pyparsing==2.3.1 +pyrsistent==0.14.11 +PySocks==1.6.8 +pytest==4.3.1 +pytest-arraydiff==0.3 +pytest-astropy==0.5.0 +pytest-doctestplus==0.3.0 +pytest-openfiles==0.3.2 +pytest-remotedata==0.3.1 +python-dateutil==2.8.0 
diff --git a/req_pip.txt b/req_pip.txt
new file mode 100644
index 0000000..60471d8
--- /dev/null
+++ b/req_pip.txt
@@ -0,0 +1,25 @@
+secretstorage==3.1.1
+jupyter-console==6.0.0
+mkl-random==1.0.2
+backports.shutil-get-terminal-size==1.0.0
+ipython-genutils==0.2.0
+grpcio==1.24.1
+jupyterlab-server==0.2.0
+mkl-fft==1.0.10
+ruamel-yaml==0.15.46
+umi-tools==1.0.0
+importlib-metadata==0.0.0
+libarchive-c==2.8
+jupyter-core==4.4.0
+tables==3.5.1
+torch==1.3.0+cpu
+torchvision==0.4.1+cpu
+pytorch-ignite==0.2.1
+msgpack==0.6.1
+tensorflow-estimator==2.0.1
+jupyter-client==5.2.4
+opt-einsum==3.1.0
+pysam==0.15.2
+lief==0.9.0
+prometheus-client==0.6.0
+prompt-toolkit==2.0.9
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..2d7303f
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,16 @@
+from setuptools import setup
+
+setup(name='ddi',
+      version='0.0.1',
+      description='',
+      url='https://github.com/CMI-UZH/side-effects',
+      packages=['ddi'],
+      python_requires='>=3.6.0',
+      install_requires=[
+          'numpy',
+          'pandas',
+          'scipy',
+          'scikit-learn',
+          'torch'
+      ],
+      zip_safe=False)
\ No newline at end of file
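With setup.py in place, the new ddi package can be installed in editable mode from the repository root (pip install -e .), after which the workflow modules resolve as ordinary imports instead of notebook-relative paths. A minimal sketch, assuming only the package layout this patch introduces (the submodule names match the files added above; no function signatures are assumed):

    import ddi
    from ddi import dataset, model, utilities, run_workflow

    # with an editable install, __file__ points back into the source tree
    print(ddi.__file__)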