diff --git a/fileio.py b/fileio.py
new file mode 100644
index 0000000..7be1dd7
--- /dev/null
+++ b/fileio.py
@@ -0,0 +1,42 @@
+import glob
+import os
+import pickle
+
+
+def load_directory(path):
+    # All directory names in path are class names;
+    # all files inside a directory share that label.
+    class_paths = glob.glob(path + '/*')
+    class_names = [os.path.split(x)[-1] for x in class_paths]
+    file_names = {x: glob.glob(os.path.join(path, x, '*'))
+                  for x in class_names}
+    return class_names, file_names
+
+
+def pickle_results(path, file_name, data):
+    if not os.path.exists(path):
+        os.makedirs(path)
+    # pickle requires a binary-mode file handle
+    with open(os.path.join(path, file_name), 'wb') as f:
+        pickle.dump(data, f)
+    return True
+
+
+def file_dict_to_flat(file_dict):
+    # flatten {class: [files]} into a single list of files
+    file_list = []
+    for class_name in file_dict:
+        file_list.extend(file_dict[class_name])
+    return file_list
+
+
+def file_list_to_dict(file_list):
+    # rebuild {class: [files]} from paths of the form .../class_name/file
+    file_dict = {}
+    for f in file_list:
+        class_name = f.split('/')[-2]
+        if class_name in file_dict:
+            file_dict[class_name].append(f)
+        else:
+            file_dict[class_name] = [f]
+    return file_dict
diff --git a/hotdog.jpg b/hotdog.jpg
new file mode 100644
index 0000000..d94d7de
Binary files /dev/null and b/hotdog.jpg differ
diff --git a/inference.sh b/inference.sh
new file mode 100644
index 0000000..e53c214
--- /dev/null
+++ b/inference.sh
@@ -0,0 +1,8 @@
+# POST an image to the inference endpoint; $1 = image path, $2 = model name
+curl -X POST \
+  http://127.0.0.1:3031/inceptionV3/predict \
+  -F image=@$1 \
+  -F model_name=$2
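+
+# Example (illustrative), with the michaniki server running locally on
+# port 3031 and the bundled sample image:
+#   bash inference.sh hotdog.jpg base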
diff --git a/labeling.py b/labeling.py
new file mode 100644
index 0000000..e0a4fa9
--- /dev/null
+++ b/labeling.py
@@ -0,0 +1,126 @@
+import random
+
+import numpy as np
+from keras.preprocessing import image
+from keras.applications.inception_v3 import preprocess_input
+from sklearn.cluster import KMeans
+from sklearn.metrics import pairwise_distances_argmin_min
+
+from sherlockWrapper import query_inference_server
+
+
+def labeling_priority(data_unlabeled, trained_model, n, method='ed',
+                      data_labeled=None):
+    # pick the next n files to label: at random, or by greedy
+    # euclidean-distance ('ed') farthest-point sampling in feature space
+    if method == 'random':
+        return randomly_choose_n(data_unlabeled, n)
+    if method == 'ed':
+        unlabeled_features = feature_extraction(data_unlabeled, trained_model)
+        points = pick_points_faster(unlabeled_features, trained_model, n)
+        return [data_unlabeled[idx] for idx in points]
+
+
+def check_label_top_n(result, label, n=1):
+    # returns True if label is in the top n predictions, False otherwise
+    for r in result['data']['prediction'][:n]:
+        if label in r['label']:
+            return True
+    return False
+
+
+def choose_n_from_each_class(file_dict, n):
+    # pop up to n files from each class (mutates file_dict)
+    ret_dict = {}
+    for class_name in file_dict:
+        ret_dict[class_name] = []
+        for i in range(n):
+            if file_dict[class_name]:
+                ret_dict[class_name].append(file_dict[class_name].pop(0))
+    return ret_dict
+
+
+def run_inference_on_dict(file_dict, model_name='base',
+                          url='http://127.0.0.1:3031/inceptionV3/predict'):
+    results = {}
+    for class_name in file_dict:
+        results[class_name] = [query_inference_server(dp, model_name, url)
+                               for dp in file_dict[class_name]]
+    return results
+
+
+def choose_n(file_dict, n):
+    # selects the first n files from each class
+    ret_dict = {}
+    for class_name in file_dict:
+        if file_dict[class_name]:
+            ret_dict[class_name] = file_dict[class_name][:n]
+    return ret_dict
+
+
+def randomly_choose_n(file_list, n):
+    # randomly selects n files in total (fixed seed for reproducibility)
+    random.seed(90210)
+    return random.sample(file_list, n)
+
+
+def compute_accuracy(predictions, class_name):
+    # fraction of files whose top-1 prediction matches the true class
+    res = predictions[class_name]
+    correct = sum(r['data']['prediction'][0]['label'] == class_name
+                  for r in res)
+    return float(correct) / len(res)
+
+
+def feature_extraction(file_names, your_model):
+    # given a list of image paths and a model, returns one flattened
+    # feature vector (ndarray) per image
+    feature_list = []
+    for f in file_names:
+        img = image.load_img(f, target_size=(299, 299))
+        x = np.expand_dims(image.img_to_array(img), axis=0)
+        x = preprocess_input(x)
+        feature = your_model.predict(x)
+        feature_list.append(np.array(feature).flatten())
+    return feature_list
+
+
+def pick_points_faster(unlabeled_features, model, n, labeled_features=None):
+    # greedy farthest-point sampling: repeatedly select the unlabeled point
+    # whose distance to its nearest already-selected point is largest
+    labels = []
+    if not labeled_features:
+        labeled_features = [unlabeled_features[0]]
+        labels.append(0)
+        n = n - 1
+
+    _, values = pairwise_distances_argmin_min(unlabeled_features,
+                                              labeled_features)
+    for i in range(n):
+        max_of_min = np.argmax(values)
+        labeled_features.append(unlabeled_features[max_of_min])
+        labels.append(max_of_min)
+        _, values_new = pairwise_distances_argmin_min(unlabeled_features,
+                                                      [labeled_features[-1]])
+        # keep the running minimum distance to the selected set
+        for j in range(len(unlabeled_features)):
+            if values_new[j] < values[j]:
+                values[j] = values_new[j]
+
+    return labels
+
+
+def cluster_label(file_names, your_model, n):
+    # cluster image features into n groups with k-means
+    feature_list_np = np.array(feature_extraction(file_names, your_model))
+    return KMeans(n_clusters=n, random_state=0).fit(feature_list_np)
+
+
+# requirements: keras, tensorflow, scikit-learn, requests, boto3
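+
+# Illustrative usage sketch (assumes `file_list` holds image paths and
+# `iv3_topless` is the pooled, top-less InceptionV3 built in main.py):
+#
+#   features = feature_extraction(file_list, iv3_topless)
+#   picked = pick_points_faster(features, iv3_topless, 25)
+#   to_label = [file_list[i] for i in picked]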
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..d732f96
--- /dev/null
+++ b/main.py
@@ -0,0 +1,111 @@
+import os
+
+import requests
+from keras.applications.inception_v3 import InceptionV3
+
+from fileio import *
+from sherlockWrapper import *
+from labeling import *
+
+
+def random_images_loop(model_name, file_loc, base_model='inceptionV3',
+                       N_initial=100, bucket='insightai2019',
+                       ip_addr='http://127.0.0.1:3031/'):
+    # main body for the randomly-sampled baseline
+    output_path = './results/' + model_name
+    transfer_url = ip_addr + base_model + '/transfer'
+    inference_url = ip_addr + base_model + '/predict'
+    status_url = ip_addr + 'tasks/info'
+
+    class_names, file_names = load_directory('./' + file_loc + '/train/')
+    validate_class_names, validate_file_names = load_directory(
+        './' + file_loc + '/val/')
+    test_class_names, test_file_names = load_directory('./' + file_loc + '/test/')
+
+    train_dict = choose_n(file_names, N_initial)
+
+    upload_to_S3(train_dict, os.path.join('models', model_name, 'train'))
+    upload_to_S3(validate_file_names, os.path.join('models', model_name, 'val'))
+
+    r = train_new_model(model_name, bucket_name=bucket,
+                        path_prefix='models', url=transfer_url)
+    wait_for_training(r)
+    rid = r['task_id']
+    response = requests.post(status_url, json={rid: rid})
+    r_acc = response.json()
+
+    test_random = run_inference_on_dict(test_file_names, model_name)
+    acc_random = [compute_accuracy(test_random, k) for k in test_random]
+
+    save_file_name = 'r{}.pickle'.format(0)
+    pickle_results(output_path, save_file_name, [r_acc, acc_random, test_random])
+    return r_acc, test_random
+
+
+def non_random_images_loop(model_name, file_loc, base_model='inceptionV3',
+                           N_initial=100, bucket='insightai2019',
+                           ip_addr='http://127.0.0.1:3031/'):
+    # main body for farthest-point ('ed') selection
+    output_path = './results/' + model_name
+    transfer_url = ip_addr + base_model + '/transfer'
+    inference_url = ip_addr + base_model + '/predict'
+    status_url = ip_addr + 'tasks/info'
+    retrain_url = ip_addr + 'inceptionV3/retrain'
+
+    # top-less InceptionV3 with global max pooling as the feature extractor
+    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
+                              pooling='max', input_shape=(299, 299, 3))
+
+    class_names, file_names = load_directory('./' + file_loc + '/train/')
+    validate_class_names, validate_file_names = load_directory(
+        './' + file_loc + '/val/')
+    class_names, test_file_names = load_directory('./' + file_loc + '/test/')
+
+    file_list = []
+    file_labels = []
+    for k in file_names:
+        file_list.extend(file_names[k])
+        file_labels.extend([k] * len(file_names[k]))
+
+    unlabeled_features = feature_extraction(file_list, iv3_topless)
+    points = pick_points_faster(unlabeled_features, iv3_topless, N_initial)
+
+    upload_dict = {k: [] for k in class_names}
+    for idx in points:
+        upload_dict[file_labels[idx]].append(file_list[idx])
+    upload_to_S3(upload_dict, os.path.join('models', model_name, 'train'))
+    upload_to_S3(validate_file_names, os.path.join('models', model_name, 'val'))
+
+    r = train_new_model(model_name, bucket_name=bucket,
+                        path_prefix='models', url=transfer_url)
+    wait_for_training(r)
+    rid = r['task_id']
+    response = requests.post(status_url, json={rid: rid})
+    train_acc = response.json()  # 83.6 training, 81.6 validation
+
+    test_results = run_inference_on_dict(test_file_names, model_name)
+    test_acc = [compute_accuracy(test_results, k) for k in test_results]
+
+    return train_acc, test_acc
+
+
+def main(model_name, base_model='inceptionV3', N_initial=5,
+         iterations=1, labelsPerRound=5, bucket='insightai2019',
+         ip_addr='http://127.0.0.1:3031/'):
+    # settings below override the defaults for one specific experiment
+    model_name = 'HotWineBike1kRandom'
+    N_initial = 25
+    labelsPerRound = 25
+    output_path = './results/' + model_name
+    transfer_url = ip_addr + base_model + '/transfer'
+    inference_url = ip_addr + base_model + '/predict'
+    status_url = ip_addr + 'tasks/info'
+    retrain_url = ip_addr + 'inceptionV3/retrain'
+    iv3 = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
+    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
+                              input_shape=(299, 299, 3))
+    # load the images - array of Images
+
+
+if __name__ == '__main__':
+    main('tomato_potato')
diff --git a/misc.py b/misc.py
new file mode 100644
index 0000000..5445877
--- /dev/null
+++ b/misc.py
@@ -0,0 +1,23 @@
+import glob
+import os
+import random
+import shutil
+
+
+def train_test_validate(path, model_path, n_test=50, n_val=50,
+                        names=['test', 'val', 'train']):
+    # shuffle one class directory and split it into test/val/train folders
+    class_name = os.path.split(path)[-1]
+    file_names = glob.glob(os.path.join(path, '*'))
+    destinations = [os.path.join(model_path, x, class_name) for x in names]
+    random.shuffle(file_names)
+    for d in destinations:
+        if not os.path.isdir(d):
+            os.makedirs(d)
+
+    splits = [file_names[:n_test],
+              file_names[n_test:n_test + n_val],
+              file_names[n_test + n_val:]]
+    for dest, chunk in zip(destinations, splits):
+        for f in chunk:
+            shutil.move(f, os.path.join(dest, os.path.split(f)[-1]))
diff --git a/sherlockWrapper.py b/sherlockWrapper.py
new file mode 100644
index 0000000..58ba29a
--- /dev/null
+++ b/sherlockWrapper.py
@@ -0,0 +1,75 @@
+import os
+import time
+
+import boto3
+import requests
+
+
+def query_inference_server(file_name, model_name='base',
+                           url='http://127.0.0.1:3031/inceptionV3/predict'):
+    form_data = {'model_name': model_name}
+    with open(file_name, 'rb') as f:
+        response = requests.post(url, files={'image': f}, data=form_data)
+    return response.json()
+
+
+def train_new_model(model_name, bucket_name='insightai2019',
+                    path_prefix='models',
+                    url='http://127.0.0.1:3031/inceptionV3/transfer'):
+    form_data = {
+        'train_bucket_name': bucket_name,
+        'train_bucket_prefix': os.path.join(path_prefix, model_name)
+    }
+    response = requests.post(url, data=form_data)
+    return response.json()
+
+
+def check_status(task_id, url='http://127.0.0.1:3031/tasks/info'):
+    response = requests.post(url, json={task_id: task_id})
+    return response.json()['Tasks Status'][0]['status'] == 'SUCCESS'
+
+
+def retrain_model(model_name, path, bucket_name='insightai2019',
+                  nb_epoch=3, batch_size=2,
+                  url='http://127.0.0.1:3031/inceptionV3/retrain'):
+    form_data = {
+        'nb_epoch': nb_epoch,
+        'batch_size': batch_size,
+        'train_bucket_name': bucket_name,
+        'train_bucket_prefix': os.path.join(path, model_name)
+    }
+    response = requests.post(url, data=form_data)
+    return response.json()
+
+
+def upload_to_S3(file_dict, key_path, bucket_name='insightai2019'):
+    # push all files in file_dict to S3 under key_path/<class>/<file>
+    s3 = boto3.client('s3')
+    for key in file_dict:
+        for datapoint in file_dict[key]:
+            fname = os.path.split(datapoint)[-1]
+            file_key = os.path.join(key_path, key, fname)
+            print(file_key)
+            s3.upload_file(datapoint, bucket_name, file_key)
+
+
+def wait_for_training(response, t=20, t_max=900,
+                      url='http://127.0.0.1:3031/tasks/info'):
+    # poll the task endpoint, backing off gently, until training succeeds
+    # or t_max seconds have elapsed
+    waited = 0
+    while not check_status(response['task_id'], url):
+        if waited >= t_max:
+            return 0
+        time.sleep(t)
+        waited += t
+        t += t / 10
+    return 1
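+
+# Illustrative end-to-end flow (assumes the michaniki server on port 3031,
+# the 'insightai2019' bucket, and a file_dict mapping class names to local
+# image paths; 'my_model' is a hypothetical model name):
+#
+#   upload_to_S3(file_dict, 'models/my_model/train')
+#   r = train_new_model('my_model')
+#   wait_for_training(r)
+#   print(query_inference_server('hotdog.jpg', 'my_model'))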
diff --git a/src/app/apis/InceptionV3/API_helpers.py b/src/app/apis/InceptionV3/API_helpers.py
index 2beef9b..410701b 100644
--- a/src/app/apis/InceptionV3/API_helpers.py
+++ b/src/app/apis/InceptionV3/API_helpers.py
@@ -41,7 +41,7 @@ def save_classes_label_dict(label_dict, file_path_name):
     with open(file_path_name, 'w') as fp:
         json.dump(label_dict, fp)
 
-    print "* Helper: Classes Label Json Saved"
+    print "*Helper: Classes Label Json Saved"
 
 def download_a_dir_from_s3(bucket_name, bucket_prefix, local_path):
     """
@@ -51,7 +51,7 @@ def download_a_dir_from_s3(bucket_name, bucket_prefix, local_path):
 
     Will not download if the local folder already exists
     """
-    print "* Helper: Loading Images from S3 {} {}".format(bucket_name,bucket_prefix)
+    print "*Helper: Loading Images from S3 {} {}".format(bucket_name, bucket_prefix)
 
     output_path = os.path.join(local_path, bucket_prefix)
 
     if not os.path.exists(os.path.join(output_path, 'train')):
@@ -69,12 +69,17 @@ def download_a_dir_from_s3(bucket_name, bucket_prefix, local_path):
             try:
                 os.makedirs(save_path)
             except OSError:
                 pass
-            mybucket.download_file(obj.key, os.path.join(save_path, filename))
-
+            # the S3 iterator yields keys for directories as well as files
+            try:
+                mybucket.download_file(obj.key,
+                                       os.path.join(save_path, filename))
+            except OSError:
+                pass
+
     print "* Helper: Images Loaded at: {}".format(output_path)
     return output_path
-
\ No newline at end of file
+
diff --git a/src/app/apis/InceptionV3/inceptionV3.py b/src/app/apis/InceptionV3/inceptionV3.py
index 8c2caa4..4bb2d9b 100644
--- a/src/app/apis/InceptionV3/inceptionV3.py
+++ b/src/app/apis/InceptionV3/inceptionV3.py
@@ -48,12 +48,14 @@ def label():
     s3_bucket_name = request.form.get('s3_bucket_name')
     s3_bucket_prefix = request.form.get('s3_bucket_prefix')
     model_name = request.form.get('model_name')
-
+    print "s3_bucket_name: {}".format(s3_bucket_name)
+    print "s3_bucket_prefix: {}".format(s3_bucket_prefix)
+    print "model_name: {}".format(model_name)
     # load image from s3
     image_data_path = API_helpers.download_a_dir_from_s3(s3_bucket_name,
                                                          s3_bucket_prefix,
                                                          local_path = TEMP_FOLDER)
-
+    print "image_data_path: {}".format(image_data_path)
     # for each images in the folder
     # supports .png and .jpg
     all_image_ids = []
@@ -116,12 +118,16 @@ def retrain():
     @args: train_bucket_url: URL pointing to the folder for training data on S3
     @args: model_name: the name of the model to be retrained; the folder must exist
     """
+    print "retrain line 1"
    s3_bucket_name = request.form.get('train_bucket_name')
+    print "s3 bucket name = {}".format(s3_bucket_name)
     s3_bucket_prefix = request.form.get('train_bucket_prefix')
+    print "s3 bucket prefix = {}".format(s3_bucket_prefix)
     nb_epoch = request.form.get('nb_epoch')
     batch_size = request.form.get('batch_size')
     model_name = s3_bucket_prefix.split('/')[-1]
+    print "model_name = {}".format(model_name)
     local_data_path = os.path.join('./tmp')
 
     # create a celery task id
@@ -129,7 +135,7 @@ def retrain():
     # download the folder in the url
     # return the path of the image files
     async_retrain.apply_async((model_name,
-                               local_data_path,
+                               # local_data_path,
                                s3_bucket_name,
                                s3_bucket_prefix,
                                nb_epoch,
@@ -182,9 +188,12 @@ def run_inceptionV3():
 
     # load model name
     model_name = request.form.get('model_name')
+    print "model_name = {}, type = {}".format(model_name, type(model_name))
 
     # load and pre-processing image
     img = request.files['image']
+    print "image type = {}".format(type(img))
+
     img = image.load_img(img, target_size = (299, 299))
     x = np.expand_dims(image.img_to_array(img), axis=0)
     x = preprocess_input(x)
@@ -226,4 +235,4 @@
 
 
 
-
\ No newline at end of file
+
diff --git a/src/app/apis/tasks/remote_tasks.py b/src/app/apis/tasks/remote_tasks.py
index 4801508..ec56668 100644
--- a/src/app/apis/tasks/remote_tasks.py
+++ b/src/app/apis/tasks/remote_tasks.py
@@ -12,12 +12,13 @@ def task_info():
     check the progress of the remote tasks by its id
     """
     task_ids = request.get_json()
+    print "task_ids = {}".format(task_ids)
     task_results = []
 
     for each_id in task_ids:
         this_task = michaniki_celery_app.AsyncResult(each_id)
         this_status = str(this_task.state)
-
+        print "this_status = {}".format(this_status)
         if this_status == "SUCCESS":
             # get the training and validation acc
             this_res = this_task.get()
@@ -50,4 +51,4 @@ def cancel_task():
             "remote_task_id": task_id,
             "status": "REVOKED"
         }
-    })
\ No newline at end of file
+    })
diff --git a/src/app/tasks.py b/src/app/tasks.py
index 009fd06..dce9915 100644
--- a/src/app/tasks.py
+++ b/src/app/tasks.py
@@ -32,7 +32,7 @@ def async_retrain(model_name,
     retrain model
     resume training
     """
-    # download image data to local 
+    # download image data to local
     image_data_path = API_helpers.download_a_dir_from_s3(s3_bucket_name,
                                                          s3_bucket_prefix,
                                                          local_path = TEMP_FOLDER)
@@ -108,4 +108,4 @@ def async_transfer(model_name,
         shutil.rmtree(image_data_path, ignore_errors=True)
         raise
 
-
\ No newline at end of file
+
diff --git a/train.sh b/train.sh
new file mode 100644
index 0000000..dda2279
--- /dev/null
+++ b/train.sh
@@ -0,0 +1,9 @@
+# kick off transfer learning from the S3 prefix holding train/ and val/ data
+curl -X POST \
+  http://127.0.0.1:3031/inceptionV3/transfer \
+  -F train_bucket_name=insightai2019 \
+  -F train_bucket_prefix=models/tomato_potato
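+
+# Usage sketch (assumes the server is up and the S3 prefix above already
+# holds train/ and val/ folders, e.g. uploaded via sherlockWrapper.upload_to_S3):
+#   bash train.sh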