
Labeling #5

Open

aaronh2014 wants to merge 42 commits into master from labeling
Changes from all commits (42 commits)
0367222
initial commit
aaronh2014 Jan 23, 2019
2db1285
added potato-tomato
aaronh2014 Jan 23, 2019
857f239
outline of process
aaronh2014 Jan 24, 2019
205e51a
saving chanes before switching branches
aaronh2014 Jan 24, 2019
7b2e950
more notes on workflow
aaronh2014 Jan 25, 2019
431da4f
parsed output, selected initial training set
aaronh2014 Jan 29, 2019
d8ae7e2
trapped an error in s3 download, added scripts to call curls commands
aaronh2014 Jan 30, 2019
85c8732
removed debugging info from API_helpers
aaronh2014 Jan 30, 2019
40c25dc
fixed a space, removed another print statement
aaronh2014 Jan 30, 2019
ccc637a
save progress before shutting down...requests still not working
aaronh2014 Jan 30, 2019
72155b3
added model as a parameter to inference script
aaronh2014 Jan 30, 2019
714d54c
added queryInferenceServer
aaronh2014 Jan 30, 2019
95e632f
commiting to so i can delete rather than lookup syntax
aaronh2014 Jan 30, 2019
8cf9154
Delete hello_world.py
aaronh2014 Jan 30, 2019
eafbb1f
added uploadToS3
aaronh2014 Jan 30, 2019
278cd29
Merge branch 'labeling' of https://github.com/InsightDataCommunity/Sh…
aaronh2014 Jan 30, 2019
1e7e50d
added uploadToS3
aaronh2014 Jan 30, 2019
8fbc7fc
added transfer learning and retraining calls
aaronh2014 Jan 31, 2019
1e2b893
pipeline completed, ready to run on larger datasets. added many new f…
aaronh2014 Feb 5, 2019
7c1a927
updated wait time to reduce spam...maybe useful to make it smarter still
aaronh2014 Feb 5, 2019
b50979f
tested for bad images
aaronh2014 Feb 6, 2019
c70021e
print statements aren't necessary, but removing local_data_path fixes…
aaronh2014 Feb 6, 2019
65dc60b
added print statements for task_ids
aaronh2014 Feb 6, 2019
dbca012
no change except white space
aaronh2014 Feb 6, 2019
1d80e8a
fixed pickle error
aaronh2014 Feb 6, 2019
53b98bf
ensure at least one pic of each class each round of retraining. some…
aaronh2014 Feb 6, 2019
ac72ea8
snapshot before refactor
aaronh2014 Feb 7, 2019
04995eb
adding more urls
aaronh2014 Feb 7, 2019
2bdc8cc
fixed )
aaronh2014 Feb 7, 2019
b0f905e
red
aaronh2014 Feb 7, 2019
a4c8ea4
split file selection / training / inference
aaronh2014 Feb 8, 2019
b3d9a5b
checking sync between machines
aaronh2014 Feb 10, 2019
886846a
Added kmeans clustering
aaronh2014 Feb 10, 2019
6788246
pushing cluster based approach
aaronh2014 Feb 11, 2019
7369c8d
cleaning up codebase rd 1
aaronh2014 Feb 11, 2019
ee6ef05
sync
aaronh2014 Feb 11, 2019
296b1cc
added '/'
aaronh2014 Feb 12, 2019
b05effd
sync
aaronh2014 Feb 14, 2019
96e4e29
running another experiment
aaronh2014 Feb 14, 2019
4665849
-n
aaronh2014 Feb 14, 2019
178c0c9
split labeling into main/sherlockwrapper/fileio/misc
aaronh2014 Feb 22, 2019
f348ee5
cleaning up
aaronh2014 Feb 22, 2019
36 changes: 36 additions & 0 deletions fileio.py
@@ -0,0 +1,36 @@
import glob
import os
import pickle

def load_directory(path):
# all directory names in path are class names
# all files inside a directory share label
class_paths = glob.glob(path + '/*')
class_names = list(map(lambda x: os.path.split(x)[-1], class_paths))
file_names = {x: glob.glob(os.path.join(path,x,'*')) for x in class_names}
return class_names, file_names


def pickle_results(path, file_name, data):
    if not os.path.exists(path):
        os.makedirs(path)
    # pickle requires a binary-mode file handle
    with open(os.path.join(path, file_name), 'wb') as f:
        pickle.dump(data, f)
    return True


def file_dict_to_flat(file_dict):
    # flatten {class_name: [files]} into a single list of file paths
    file_list = []
    for class_name in file_dict:
        file_list.extend(file_dict[class_name])
    return file_list


def file_list_to_dict(file_list):
    # rebuild the {class_name: [files]} mapping; the class label is the
    # name of each file's parent directory
    file_dict = {}
    for f in file_list:
        class_name = os.path.basename(os.path.dirname(f))
        file_dict.setdefault(class_name, []).append(f)
    return file_dict
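
For orientation, a minimal usage sketch of these helpers (the ./data/train layout and ./results path are illustrative, not part of this PR):

    import pickle
    from fileio import load_directory, pickle_results, file_dict_to_flat

    class_names, file_names = load_directory('./data/train')
    all_files = file_dict_to_flat(file_names)   # flat list of every image path
    pickle_results('./results', 'files.pickle', file_names)

    # pickle_results writes with pickle.dump, so read back with pickle.load
    with open('./results/files.pickle', 'rb') as f:
        assert pickle.load(f) == file_names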
Binary file added hotdog.jpg
7 changes: 7 additions & 0 deletions inference.sh
@@ -0,0 +1,7 @@
#!/bin/bash
# POST an image to the inference endpoint; curl builds the multipart body
# (the hard-coded Postman boundary header would not match curl's own body)
curl -X POST \
  http://127.0.0.1:3031/inceptionV3/predict \
  -H 'Cache-Control: no-cache' \
  -F image=@"$1" \
  -F model_name="$2"
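
This script is the shell counterpart of query_inference_server() in sherlockWrapper.py. A quick smoke test, assuming the inference server is up on port 3031, using the hotdog.jpg added in this PR and the default 'base' model name:

    ./inference.sh hotdog.jpg base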
145 changes: 145 additions & 0 deletions labeling.py
@@ -0,0 +1,145 @@
import random

import numpy as np
from keras.models import load_model
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def labeling_priority(data_unlabeled, trained_model, n, method='ed', data_labeled=None):
    # rank the unlabeled files and return the n most useful to label next;
    # 'ed' greedily spreads picks out by euclidean distance in feature space
    if method == 'random':
        return randomly_choose_n(data_unlabeled, n)
    if method == 'ed':
        unlabeled_features = feature_extraction(data_unlabeled, trained_model)
        points = pick_points_faster(unlabeled_features, trained_model, n)
        return [data_unlabeled[idx] for idx in points]


def check_label_top_n(result, label, n=1):
    # returns True if label appears in the top n predictions, False otherwise
    result = result['data']['prediction'][:n]
    for r in result:
        if label in r['label']:
            return True
    return False


def choose_n_from_each_class(file_dict, n):
    # pop up to n files from each class; note this mutates file_dict in place
    ret_dict = {}
    for class_name in file_dict:
        ret_dict[class_name] = []
        for i in range(n):
            if file_dict[class_name]:
                ret_dict[class_name].append(file_dict[class_name].pop(0))
    return ret_dict


def run_inference_on_dict(file_dict, model_name='base',
url='http://127.0.0.1:3031/inceptionV3/predict'):
results = {}
for class_name in file_dict:
results[class_name] = []
for dp in file_dict[class_name]:
results[class_name].append(query_inference_server(dp, model_name,url))
return results








def choose_n(file_dict, n):
    # selects the first n files from each class (does not mutate file_dict)
    ret_dict = {}
    for class_name in file_dict:
        if file_dict[class_name]:
            ret_dict[class_name] = file_dict[class_name][:n]
    return ret_dict


def randomly_choose_n(file_list, n):
    # randomly selects n total files; the fixed seed keeps runs reproducible
    random.seed(90210)
    return random.sample(file_list, n)


def compute_accuracy(predictions, class_name):
    # fraction of images whose top-1 prediction matches class_name
    res = predictions[class_name]
    correct = sum(res[x]['data']['prediction'][0]['label'] == class_name
                  for x in range(len(res)))
    return float(correct) / len(res)


def feature_extraction(file_names, your_model):
    # given a list of image paths and a model, returns a list of flattened
    # feature vectors (one 1-D ndarray per image)
    feature_list = []
    for f in file_names:
        img = image.load_img(f, target_size=(299, 299))
        x = np.expand_dims(image.img_to_array(img), axis=0)
        x = preprocess_input(x)
        feature = your_model.predict(x)
        feature_list.append(np.array(feature).flatten())
    return feature_list


def pick_points_faster(unlabeled_features, model, n, labeled_features=None):
    # greedy farthest-point sampling: repeatedly pick the unlabeled point
    # whose distance to its nearest labeled point is largest
    labels = []
    if not labeled_features:
        # no labeled pool yet: seed it with the first unlabeled point
        labeled_features = [unlabeled_features[0]]
        labels.append(0)
        n = n - 1

    indices, values = pairwise_distances_argmin_min(unlabeled_features,
                                                    labeled_features)
    for i in range(n):
        max_of_min = np.argmax(values)
        labeled_features.append(unlabeled_features[max_of_min])
        labels.append(max_of_min)
        indices, values_new = pairwise_distances_argmin_min(unlabeled_features,
                                                            [labeled_features[-1]])
        # keep, for each point, the distance to its nearest labeled neighbor
        for j in range(len(unlabeled_features)):
            if values_new[j] < values[j]:
                values[j] = values_new[j]

    return labels


def cluster_label(file_names, your_model, n):
    # extract a feature vector per image, then group them into n kmeans clusters
    feature_list = feature_extraction(file_names, your_model)
    kmeans = KMeans(n_clusters=n, random_state=0).fit(np.array(feature_list))
    return kmeans




# external dependencies: keras, tensorflow, scikit-learn, requests, boto3
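
A sketch of how these pieces compose for one selection round, assuming a local ./data/train directory and the same topless InceptionV3 feature extractor built in main.py (the paths and batch size of 25 are illustrative):

    from keras.applications.inception_v3 import InceptionV3
    from fileio import load_directory, file_dict_to_flat
    from labeling import labeling_priority

    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
                              pooling='max', input_shape=(299, 299, 3))
    _, file_names = load_directory('./data/train')
    candidates = file_dict_to_flat(file_names)
    # greedy euclidean-distance selection: the 25 most spread-out images
    to_label = labeling_priority(candidates, iv3_topless, 25, method='ed')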
124 changes: 124 additions & 0 deletions main.py
@@ -0,0 +1,124 @@
import boto3
import requests
import os
import random
import shutil
import pickle
import time

from keras.applications.inception_v3 import InceptionV3

from fileio import *
from sherlockWrapper import *
from labeling import *

def random_images_loop(model_name, file_loc, base_model='inceptionV3', N_initial=100,
bucket='insightai2019', ip_addr='http://127.0.0.1:3031/'):
#main body for running random
output_path = './results/' + model_name
transfer_url = ip_addr + base_model + '/transfer'
inference_url = ip_addr + base_model + '/predict'
status_url = ip_addr + 'tasks/info'

    class_names, file_names = load_directory('./' + file_loc + '/train/')
    validate_class_names, validate_file_names = load_directory('./' +
                                                               file_loc + '/val/')
    test_class_names, test_file_names = load_directory('./' + file_loc + '/test/')

    train_dict = choose_n(file_names, N_initial)

    upload_to_S3(train_dict, os.path.join('models', model_name, 'train'))
    upload_to_S3(validate_file_names, os.path.join('models', model_name, 'val'))

r = train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url=transfer_url)
wait_for_training(r)
rid = r['task_id']
response = requests.post(status_url,json={rid:rid})
r_acc = response.json()
test_random = run_inference_on_dict(test_file_names, model_name)
acc_random = []
for k in test_random:
acc_random.append(compute_accuracy(test_random,k))

save_file_name = 'r{}.pickle'.format(0)
pickle_results(output_path, save_file_name, [r_acc,test_random])
return r_acc, test_random


def non_random_images_loop(model_name, file_loc, base_model='inceptionV3', N_initial=100,
                           bucket='insightai2019', ip_addr='http://127.0.0.1:3031/'):
# model_name = 'imgnet11.maxpool.584'
# file_loc = 'imgnetmodel'
# base_model = 'inceptionV3'
# N_initial = 584
# bucket = 'insightai2019'
# ip_addr='http://127.0.0.1:3031/'

output_path = './results/' + model_name
transfer_url = ip_addr + base_model + '/transfer'
inference_url = ip_addr + base_model + '/predict'
status_url = ip_addr + 'tasks/info'
retrain_url=ip_addr + 'inceptionV3/retrain'

# iv3 = InceptionV3(weights='imagenet',input_shape=(299,299,3))
    iv3_topless = InceptionV3(include_top=False, weights='imagenet', pooling='max',
                              input_shape=(299, 299, 3))

    class_names, file_names = load_directory('./' + file_loc + '/train/')
    validate_class_names, validate_file_names = load_directory('./' +
                                                               file_loc + '/val/')
    class_names, test_file_names = load_directory('./' + file_loc + '/test/')

file_list = []
file_labels = []
for k in file_names:
file_list.extend(file_names[k])
file_labels.extend([k] * len(file_names[k]))

unlabeled_features = feature_extraction(file_list, iv3_topless)
points = pick_points_faster(unlabeled_features, iv3_topless, N_initial)
labeled_files = [file_list[idx] for idx in points]

upload_dict = {k :[] for k in class_names}
for idx in points:
upload_dict[file_labels[idx]].append(file_list[idx])
upload_to_S3(upload_dict,os.path.join('models',model_name,'train'))
upload_to_S3(validate_file_names, os.path.join('models',model_name,'val'))

r = train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url=transfer_url)
wait_for_training(r)
rid = r['task_id']
response = requests.post(status_url,json={rid:rid})
    train_acc = response.json()  # 83.6% training, 81.6% validation accuracy
    test_results = run_inference_on_dict(test_file_names, model_name)
    test_acc = []
    for k in test_results:
        test_acc.append(compute_accuracy(test_results, k))

return train_acc, test_acc


def main(model_name, base_model='inceptionV3', N_initial=5,
         iterations=1, labelsPerRound=5, bucket='insightai2019',
         ip_addr='http://127.0.0.1:3031/'):
    # NOTE: the experiment settings below intentionally override the arguments
    model_name = 'HotWineBike1kRandom'
    base_model = 'inceptionV3'
    N_initial = 25
    iterations = 1
    labelsPerRound = 25
    bucket = 'insightai2019'
    output_path = './results/' + model_name
    ip_addr = 'http://127.0.0.1:3031/'
    transfer_url = ip_addr + base_model + '/transfer'
    inference_url = ip_addr + base_model + '/predict'
    status_url = ip_addr + 'tasks/info'
    retrain_url = ip_addr + 'inceptionV3/retrain'
    iv3 = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
                              input_shape=(299, 299, 3))
    # load the images - array of Images


if __name__ == '__main__':
main('tomato_potato')
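
For comparison runs, the two loops above are intended to be pointed at the same data directory; a hedged sketch (the model names and file_loc are illustrative):

    r_acc, test_random = random_images_loop('demo.random', 'imgnetmodel', N_initial=100)
    train_acc, test_acc = non_random_images_loop('demo.ed', 'imgnetmodel', N_initial=100)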

17 changes: 17 additions & 0 deletions misc.py
@@ -0,0 +1,17 @@
import glob
import os
import random
import shutil

def train_test_validate(path, model_path, n_test=50, n_val=50,
                        names=['test', 'val', 'train']):
    # shuffle the files of one class directory, then move them into
    # test/val/train subdirectories under model_path
    class_name = os.path.split(path)[-1]
    file_names = glob.glob(os.path.join(path, '*'))
    destinations = [os.path.join(model_path, x, class_name) for x in names]
    random.shuffle(file_names)
    for d in destinations:
        if not os.path.isdir(d):
            os.makedirs(d)

    splits = [file_names[:n_test],
              file_names[n_test:n_test + n_val],
              file_names[n_test + n_val:]]
    for dest, files in zip(destinations, splits):
        for f in files:
            shutil.move(f, os.path.join(dest, os.path.split(f)[-1]))
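
A usage sketch, assuming raw per-class image folders under ./raw (hypothetical paths): one call per class produces the test/val/train tree that load_directory() and the loops in main.py expect:

    import glob
    from misc import train_test_validate

    for class_dir in glob.glob('./raw/*'):
        train_test_validate(class_dir, './data', n_test=50, n_val=50)
    # result: ./data/test/<class>/, ./data/val/<class>/, ./data/train/<class>/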
57 changes: 57 additions & 0 deletions sherlockWrapper.py
@@ -0,0 +1,57 @@
import os
import time

import boto3
import requests


def query_inference_server(fileName, model_name='base',
                           url='http://127.0.0.1:3031/inceptionV3/predict'):
form_data = {'model_name': model_name}
files = {'image': open(fileName, 'rb')}
response = requests.post(url, files=files, data=form_data)
return response.json()


def train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url='http://127.0.0.1:3031/inceptionV3/transfer'):
form_data = {
'train_bucket_name': bucket_name,
'train_bucket_prefix': os.path.join(path_prefix, model_name)
}

response = requests.post(url, data=form_data)
return response.json()


def check_status(task_id, url='http://127.0.0.1:3031/tasks/info'):
    # True once the server marks the training task as finished
    response = requests.post(url, json={task_id: task_id})
    return response.json()['Tasks Status'][0]['status'] == 'SUCCESS'


def retrain_model(model_name, path, bucket_name='insightai2019',
nb_epoch=3, batch_size=2,
url='http://127.0.0.1:3031/inceptionV3/retrain'):
form_data = {
'nb_epoch': nb_epoch,
'batch_size': batch_size,
'train_bucket_name': bucket_name,
'train_bucket_prefix': os.path.join(path, model_name)
}

response = requests.post(url, data=form_data)
return response.json()


def upload_to_S3(file_dict, key_path, bucket_name='insightai2019'):
    # push all files in file_dict to S3 under key_path/<class_name>/<file>
    s3 = boto3.client('s3')
    for key in file_dict:
        for datapoint in file_dict[key]:
            fname = os.path.split(datapoint)[-1]
            file_key = os.path.join(key_path, key, fname)
            print(file_key)
            s3.upload_file(datapoint, bucket_name, file_key)


def wait_for_training(response, t=20, t_max=900,
                      url='http://127.0.0.1:3031/tasks/info'):
    # poll the task endpoint until training reports SUCCESS, growing the
    # wait 10% per check and capping any single wait at t_max seconds
    status = check_status(response['task_id'], url)
    while not status:
        time.sleep(t)
        t = min(t + t / 10, t_max)
        status = check_status(response['task_id'], url)
    return 1
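
Taken together, the wrappers cover one full train-and-predict cycle. A minimal end-to-end sketch, assuming the inference server is running locally on port 3031 and the default S3 bucket is reachable (the 'demo' model name and image paths are illustrative):

    train_dict = {'hotdog': ['./data/train/hotdog/001.jpg']}
    upload_to_S3(train_dict, 'models/demo/train')
    r = train_new_model('demo')        # kicks off transfer learning on the server
    wait_for_training(r)               # poll until the task reports SUCCESS
    result = query_inference_server('./data/test/hotdog/042.jpg', model_name='demo')
    print(result['data']['prediction'][0]['label'])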