Feature hpf training api #11

Open · wants to merge 5 commits into base: develop
13 changes: 11 additions & 2 deletions Dockerfile
@@ -3,15 +3,24 @@
FROM python:3.6-slim-buster

# Install g++ (needed to build fasttext) and the libcurl/libssl headers (needed to build pycurl)
RUN apt-get update && apt-get install -y g++ wget wait-for-it
RUN apt-get update && apt-get install -y \
    g++ \
    wget \
    wait-for-it \
    libcurl4-openssl-dev \
    libssl-dev

# Install dependencies
RUN pip3 install 'cython==0.29.14' 'scipy==1.4.0' 'tensorflow==1.13.2' 'fasttext==0.9.1' 'Flask==1.1.1' 'Orange-Bioinformatics==2.6.25' 'nested-lookup==0.2.19'
RUN pip3 install 'cython==0.29.14' 'scipy==1.4.0' 'tensorflow==1.13.2' 'fasttext==0.9.1' 'Flask==1.1.1' 'Orange-Bioinformatics==2.6.25' 'nested-lookup==0.2.19' 'pycurl==7.43.0.3'

# Put everything in /opt/ncr
RUN mkdir -p /root/opt/ncr/
WORKDIR /root/opt/ncr/
COPY . ./
RUN mkdir new_model_params/
RUN mkdir /root/uploaded_obo
RUN mkdir /root/trained_model_param
RUN mkdir /root/qsub

# This is the default command executed when starting the container
COPY startup_script.sh /
193 changes: 187 additions & 6 deletions app.py
@@ -2,31 +2,119 @@
import json

from functools import wraps
from flask import Flask, jsonify, redirect, request, render_template, url_for, abort
from flask import Flask, jsonify, redirect, request, render_template, url_for, abort, Response
import os
import re
from collections import OrderedDict
os.chdir(os.path.abspath(os.path.dirname(__file__)))

import requests
try:
    from requests import HTTPBasicAuth #Exported at the top level by older releases of requests
except ImportError:
    from requests.auth import HTTPBasicAuth

import pycurl

import ncrmodel
import train
from generate_qsub_job import upload_json_job

CONST_HOMEDIR = os.environ['HOME']

CONST_FASTTEXT_WORD_VECTOR_FILEPATH = "{}/opt/ncr/model_params/pmc_model_new.bin" #Relative to $HOME
CONST_NEGFILE_FILEPATH = "{}/wikipedia_small.txt" #Relative to $HOME
CONST_UPLOADED_OBO_DIR = "{}/uploaded_obo" #Relative to $HOME
CONST_PARAMS_FILEPATH = "{}/trained_model_param" #Relative to $HOME
CONST_QSUB_FILEPATH = "{}/qsub" #Relative to $HOME

OBO_WEBDAV_URL = os.environ['OBO_WEBDAV_URL']
LOGGING_WEBDAV_URL = os.environ['LOGGING_WEBDAV_URL']
COMPLETE_WEBDAV_URL = os.environ['COMPLETE_WEBDAV_URL']
FAILED_WEBDAV_URL = os.environ['FAILED_WEBDAV_URL']
OUTPUT_WEBDAV_URL = os.environ['OUTPUT_WEBDAV_URL']
WEBDAV_CERTPATH = os.environ['WEBDAV_CERTPATH']
WEBDAV_APIKEY = os.environ['WEBDAV_APIKEY']

app = Flask(__name__)

#Stored in the form {"object": NCR constructor arguments, "threshold": threshold value}
NCR_MODELS = {}

"""
Start with at least one hard-coded model, in future versions of this API, newer trained models
can be automatically added to the NCR_MODELS data structure. For now, simply modify the following
lines to select which trained models are to be available for use.
Start with the two freely available pre-trained NCR models. The
NCR_MODELS data structure is then populated with additional models as
training jobs complete. Note that, unlike previous versions of this
application, only the model constructor arguments are stored: keeping
more than a few trained models in memory at once would be prohibitively
expensive, so trained model objects are constructed on an as-needed
basis.
"""
NCR_MODELS['HPO'] = {}
NCR_MODELS['HPO']['object'] = ncrmodel.NCR.loadfromfile('model_params/0', 'model_params/pmc_model_new.bin')
NCR_MODELS['HPO']['object'] = ('model_params/0', 'model_params/pmc_model_new.bin')
NCR_MODELS['HPO']['threshold'] = 0.6

NCR_MODELS['MONDO'] = {}
NCR_MODELS['MONDO']['object'] = ncrmodel.NCR.loadfromfile('model_params/1', 'model_params/pmc_model_new.bin')
NCR_MODELS['MONDO']['object'] = ('model_params/1', 'model_params/pmc_model_new.bin')
NCR_MODELS['MONDO']['threshold'] = 0.6 #Copied from the HPO entry; this value should be tuned per model

AVAILABLE_MODEL_ID = []
def update_ncr_model_list():
    #Skip the WebDAV queries when running under the automated tests
    if os.environ.get('AUTOTEST'):
        return

    #Check for completed jobs under /complete in WebDAV
    complete_req = requests.get(COMPLETE_WEBDAV_URL + "/", verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
    complete_lines = complete_req.text.split('\n')
    completed_training_jobs = []
    for cl in complete_lines:
        if cl.startswith('<li>') and cl.endswith('</li>') and "JOBCOMPLETE_" in cl:
            matches = re.search(r'JOBCOMPLETE_(\d+)', cl)
            if matches is None:
                continue
            this_model_id = int(matches.group(1))
            if this_model_id not in AVAILABLE_MODEL_ID:
                completed_training_jobs.append(this_model_id)

print("completed_training_jobs = {}".format(completed_training_jobs))

    #For each completed job...
    for cj in completed_training_jobs:
        #...download the trained model parameter files
        os.mkdir("new_model_params/{}".format(cj))
        for fname in ["config.json", "ncr_weights.h5", "onto.json"]:
            with open("new_model_params/{}/{}".format(cj, fname), 'wb') as f:
                print("Getting new_model_params/{}/{}...".format(cj, fname))
                c = pycurl.Curl()
                c.setopt(c.URL, OUTPUT_WEBDAV_URL + "/{}_{}".format(cj, fname))
                c.setopt(c.WRITEDATA, f)
                c.setopt(c.CAINFO, WEBDAV_CERTPATH)
                c.setopt(c.USERPWD, "user:{}".format(WEBDAV_APIKEY))
                c.perform()
                c.close()

        #...get the name given to this model when the job was submitted
        name_req = requests.get(COMPLETE_WEBDAV_URL + "/JOBCOMPLETE_{}".format(cj), verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
        new_model_name = name_req.text.rstrip()

        #...store the NCR() constructor arguments; the object itself is built on demand
        NCR_MODELS[new_model_name] = {}
        NCR_MODELS[new_model_name]['object'] = ("new_model_params/{}".format(cj), 'model_params/pmc_model_new.bin')
        NCR_MODELS[new_model_name]['threshold'] = 0.6 #Copied from the pre-trained models; should be tuned per model

        #Record the job ID so the model is not re-downloaded
        AVAILABLE_MODEL_ID.append(cj)

#On startup, load all models from WebDAV
update_ncr_model_list()

running_job_id = 0
def generate_job_id():
    global running_job_id
    assign_job_id = running_job_id
    running_job_id += 1
    return assign_job_id

@app.route('/', methods=['POST'])
def main_page():
    text = request.form['text']
@@ -61,6 +149,7 @@ def dated_url_for(endpoint, **values):

@app.route('/models/', methods=['GET'])
def ls_models():
    update_ncr_model_list()
    new_mapping = {}
    for k in NCR_MODELS.keys():
        new_mapping[k] = {}
@@ -78,6 +167,96 @@ def delete_model(selected_model):
    return jsonify({'status': 'success'})


#Serve the webpage for model training
@app.route('/submit_training_job/', methods=['GET'])
def submit_training_job_get():
    return render_template("submit_training_job.html")

#Receive the upload form, including the OBO ontology file, for model training
@app.route('/submit_training_job/', methods=['POST'])
def submit_training_job_post():
    if 'ontology' not in request.files:
        abort(400)

    if 'name' not in request.form:
        abort(400)

    #Generate a job ID for this training task
    j_id = generate_job_id()

    #Save the uploaded ontology locally ("{}" in the path is filled with $HOME)
    ontology_file = request.files['ontology']
    ontology_filepath = "{}/{}.obo".format(CONST_UPLOADED_OBO_DIR, j_id)
    ontology_file.save(ontology_filepath.format(CONST_HOMEDIR))

    #Upload this ontology file to WebDAV
    with open(ontology_filepath.format(CONST_HOMEDIR), 'rb') as fdata:
        requests.put(OBO_WEBDAV_URL + "/{}.obo".format(j_id), verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY), data=fdata)

    #Queue the training job
    print("[JOB: {}] Queued training model {}, at root={}...".format(j_id, request.form['name'], request.form['oboroot']))
    params_output_dir = CONST_PARAMS_FILEPATH + "/{}/".format(j_id)
    training_proc_args = train.MainTrainArgClass(
        obofile=ontology_filepath,
        oboroot=request.form['oboroot'],
        fasttext=CONST_FASTTEXT_WORD_VECTOR_FILEPATH,
        neg_file=CONST_NEGFILE_FILEPATH,
        output=params_output_dir,
        verbose=True
    )

    upload_json_job(j_id, training_proc_args, request.form['name'])
    return jsonify({'status': 'submitted', 'id': j_id})


@app.route('/log/<int:j_id>')
def get_job_logs(j_id):
    #Query the WebDAV server for log messages belonging to this job
    jobs_query = requests.get(LOGGING_WEBDAV_URL, verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
    job_lines = jobs_query.text.split('\n')
    selected_ids = []
    for jl in job_lines:
        if jl.startswith('<li>') and jl.endswith('</li>'):
            matches = re.search(r'(\d+)_(\d+)\.logmsg', jl)
            if matches is None:
                continue
            line_jobid = int(matches.group(1))
            line_messageid = int(matches.group(2))
            if line_jobid == j_id:
                selected_ids.append(line_messageid)
    selected_ids.sort()

    #Download all messages, in order
    saved_messages = ""
    for message_id in selected_ids:
        get_url = LOGGING_WEBDAV_URL + "/{}_{}.logmsg".format(j_id, message_id)
        req = requests.get(get_url, verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
        saved_messages += "{}\n".format(req.text)
    return Response(saved_messages, mimetype='text/plain')

@app.route('/job/<int:j_id>')
def get_job_status(j_id):
    #Any j_id >= running_job_id has never been assigned, so it is invalid
    if j_id >= running_job_id:
        return jsonify({'status': 'invalid'})

    #Query the WebDAV server - is the job in the COMPLETE directory?
    complete_query = requests.get(COMPLETE_WEBDAV_URL + "/", verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
    complete_query_lines = complete_query.text.split('\n')
    for ln in complete_query_lines:
        if '"JOBCOMPLETE_{}"'.format(j_id) in ln:
            return jsonify({'status': 'complete'})

    #Query the WebDAV server - is the job in the FAILED directory?
    failed_query = requests.get(FAILED_WEBDAV_URL + "/", verify=WEBDAV_CERTPATH, auth=HTTPBasicAuth('user', WEBDAV_APIKEY))
    failed_query_lines = failed_query.text.split('\n')
    for ln in failed_query_lines:
        if '"JOBFAIL_{}"'.format(j_id) in ln:
            return jsonify({'status': 'failed'})

    #Otherwise, assume the job has been submitted (and is queued or running)
    return jsonify({'status': 'submitted'})

"""
@api {post} /match/ POST Method
@apiName PostMatch
@@ -363,6 +542,7 @@ def annotate_get():
    return jsonify(res)

def match(model, text):
    #Construct the NCR object on demand from the stored constructor arguments
    model = ncrmodel.NCR.loadfromfile(*model)
    matches = model.get_match([text], 10)[0]
    res = []
    for x in matches:
@@ -374,6 +554,7 @@ def match(model, text):
return {"matches":res}

def annotate(model, threshold, text):
    #Construct the NCR object on demand from the stored constructor arguments
    model = ncrmodel.NCR.loadfromfile(*model)
    matches = model.annotate_text(text, threshold)
    res = []
    for x in matches:
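For reference, a minimal end-to-end sketch of exercising the new training API from a client, assuming the service is reachable at http://localhost:5000 (the port mapping used by docker_run_webapp.sh below). The routes and the 'ontology', 'name', and 'oboroot' form fields match the handlers above; the ontology file name, model name, root term, and polling interval are illustrative only:

import time
import requests

BASE = "http://localhost:5000"  #Assumed local deployment

#Submit a training job: an OBO file plus a model name and an ontology root term
with open("hp.obo", "rb") as f:  #Illustrative ontology file
    resp = requests.post(BASE + "/submit_training_job/",
                         files={"ontology": f},
                         data={"name": "HPO_RETRAINED", "oboroot": "HP:0000118"})
j_id = resp.json()["id"]

#Poll /job/<id> until the job reaches a terminal state
while True:
    status = requests.get("{}/job/{}".format(BASE, j_id)).json()["status"]
    if status in ("complete", "failed", "invalid"):
        break
    time.sleep(60)  #Arbitrary polling interval

#Fetch the accumulated training log as plain text
print(requests.get("{}/log/{}".format(BASE, j_id)).text)

#A completed model shows up in the model listing on the next refresh
print(requests.get(BASE + "/models/").json())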
10 changes: 10 additions & 0 deletions docker_run_webapp.sh
@@ -3,7 +3,17 @@
echo "Starting Docker..."
docker run --rm \
    ${LOCALMOUNT:+ -v $(realpath ~/ncr_model_params):/root/opt/ncr/model_params:ro} \
    -v $(realpath ~/webdav_cert.pem):/root/webdav_cert.pem:ro \
    -p 127.0.0.1:5000:5000 \
    -e AUTOTEST=$AUTOTEST \
    -e TEST_IGNORE_SCORE=$TEST_IGNORE_SCORE \
    -e WEBDAV_CERTPATH=/root/webdav_cert.pem \
    -e WEBDAV_APIKEY=$WEBDAV_APIKEY \
    -e QSUB_WEBDAV_URL=$QSUB_WEBDAV_URL \
    -e OBO_WEBDAV_URL=$OBO_WEBDAV_URL \
    -e LOGGING_WEBDAV_URL=$LOGGING_WEBDAV_URL \
    -e OUTPUT_WEBDAV_URL=$OUTPUT_WEBDAV_URL \
    -e READY_WEBDAV_URL=$READY_WEBDAV_URL \
    -e COMPLETE_WEBDAV_URL=$COMPLETE_WEBDAV_URL \
    -e FAILED_WEBDAV_URL=$FAILED_WEBDAV_URL \
    -it ccmsk/neuralcr