
Labeling #5

Open

aaronh2014 wants to merge 42 commits into master from labeling
Changes from all commits (42 commits)
0367222
initial commit
aaronh2014 Jan 23, 2019
2db1285
added potato-tomato
aaronh2014 Jan 23, 2019
857f239
outline of process
aaronh2014 Jan 24, 2019
205e51a
saving chanes before switching branches
aaronh2014 Jan 24, 2019
7b2e950
more notes on workflow
aaronh2014 Jan 25, 2019
431da4f
parsed output, selected initial training set
aaronh2014 Jan 29, 2019
d8ae7e2
trapped an error in s3 download, added scripts to call curls commands
aaronh2014 Jan 30, 2019
85c8732
removed debugging info from API_helpers
aaronh2014 Jan 30, 2019
40c25dc
fixed a space, removed another print statement
aaronh2014 Jan 30, 2019
ccc637a
save progress before shutting down...requests still not working
aaronh2014 Jan 30, 2019
72155b3
added model as a parameter to inference script
aaronh2014 Jan 30, 2019
714d54c
added queryInferenceServer
aaronh2014 Jan 30, 2019
95e632f
commiting to so i can delete rather than lookup syntax
aaronh2014 Jan 30, 2019
8cf9154
Delete hello_world.py
aaronh2014 Jan 30, 2019
eafbb1f
added uploadToS3
aaronh2014 Jan 30, 2019
278cd29
Merge branch 'labeling' of https://github.com/InsightDataCommunity/Sh…
aaronh2014 Jan 30, 2019
1e7e50d
added uploadToS3
aaronh2014 Jan 30, 2019
8fbc7fc
added transfer learning and retraining calls
aaronh2014 Jan 31, 2019
1e2b893
pipeline completed, ready to run on larger datasets. added many new f…
aaronh2014 Feb 5, 2019
7c1a927
updated wait time to reduce spam...maybe useful to make it smarter still
aaronh2014 Feb 5, 2019
b50979f
tested for bad images
aaronh2014 Feb 6, 2019
c70021e
print statements aren't necessary, but removing local_data_path fixes…
aaronh2014 Feb 6, 2019
65dc60b
added print statements for task_ids
aaronh2014 Feb 6, 2019
dbca012
no change except white space
aaronh2014 Feb 6, 2019
1d80e8a
fixed pickle error
aaronh2014 Feb 6, 2019
53b98bf
ensure at least one pic of each class each round of retraining. some…
aaronh2014 Feb 6, 2019
ac72ea8
snapshot before refactor
aaronh2014 Feb 7, 2019
04995eb
adding more urls
aaronh2014 Feb 7, 2019
2bdc8cc
fixed )
aaronh2014 Feb 7, 2019
b0f905e
red
aaronh2014 Feb 7, 2019
a4c8ea4
split file selection / training / inference
aaronh2014 Feb 8, 2019
b3d9a5b
checking sync between machines
aaronh2014 Feb 10, 2019
886846a
Added kmeans clustering
aaronh2014 Feb 10, 2019
6788246
pushing cluster based approach
aaronh2014 Feb 11, 2019
7369c8d
cleaning up codebase rd 1
aaronh2014 Feb 11, 2019
ee6ef05
sync
aaronh2014 Feb 11, 2019
296b1cc
added '/'
aaronh2014 Feb 12, 2019
b05effd
sync
aaronh2014 Feb 14, 2019
96e4e29
running another experiment
aaronh2014 Feb 14, 2019
4665849
-n
aaronh2014 Feb 14, 2019
178c0c9
split labeling into main/sherlockwrapper/fileio/misc
aaronh2014 Feb 22, 2019
f348ee5
cleaning up
aaronh2014 Feb 22, 2019
36 changes: 36 additions & 0 deletions fileio.py
@@ -0,0 +1,36 @@
import glob
import os
import pickle

def load_directory(path):
# all directory names in path are class names
# all files inside a directory share label
class_paths = glob.glob(path + '/*')
class_names = list(map(lambda x: os.path.split(x)[-1], class_paths))
file_names = {x: glob.glob(os.path.join(path,x,'*')) for x in class_names}
return class_names, file_names


def pickle_results(path, file_name, data):
    if not os.path.exists(path):
        os.makedirs(path)
    # pickle requires a binary-mode file handle
    with open(os.path.join(path, file_name), 'wb') as f:
        pickle.dump(data, f)
    return True


def file_dict_to_flat(file_dict):
    # flatten {class_name: [files]} into a single list of file paths
    file_list = []
    for class_name in file_dict:
        file_list.extend(file_dict[class_name])
    return file_list


def file_list_to_dict(file_list):
    # rebuild the {class_name: [files]} mapping; the class label is the
    # name of each file's parent directory
    file_dict = {}
    for f in file_list:
        class_name = os.path.basename(os.path.dirname(f))
        file_dict.setdefault(class_name, []).append(f)
    return file_dict
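
For orientation, a minimal usage sketch of these helpers (the ./data/train layout and ./results path are illustrative, not part of this PR):

    import pickle
    from fileio import load_directory, pickle_results, file_dict_to_flat

    class_names, file_names = load_directory('./data/train')
    all_files = file_dict_to_flat(file_names)   # flat list of every image path
    pickle_results('./results', 'files.pickle', file_names)

    # pickle_results writes with pickle.dump, so read back with pickle.load
    with open('./results/files.pickle', 'rb') as f:
        assert pickle.load(f) == file_names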
Binary file added hotdog.jpg
7 changes: 7 additions & 0 deletions inference.sh
@@ -0,0 +1,7 @@
#!/bin/bash
# POST an image to the inference endpoint; curl builds the multipart body
# (the hard-coded Postman boundary header would not match curl's own body)
curl -X POST \
  http://127.0.0.1:3031/inceptionV3/predict \
  -H 'Cache-Control: no-cache' \
  -F image=@"$1" \
  -F model_name="$2"
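
This script is the shell counterpart of query_inference_server() in sherlockWrapper.py. A quick smoke test, assuming the inference server is up on port 3031, using the hotdog.jpg added in this PR and the default 'base' model name:

    ./inference.sh hotdog.jpg base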
145 changes: 145 additions & 0 deletions labeling.py
@@ -0,0 +1,145 @@
import random

import numpy as np
from keras.models import load_model
from keras.preprocessing import image
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

def labeling_priority(data_unlabeled, trained_model, n, method='ed', data_labeled=None):
    # rank the unlabeled files and return the n most useful to label next;
    # 'ed' greedily spreads picks out by euclidean distance in feature space
    if method == 'random':
        return randomly_choose_n(data_unlabeled, n)
    if method == 'ed':
        unlabeled_features = feature_extraction(data_unlabeled, trained_model)
        points = pick_points_faster(unlabeled_features, trained_model, n)
        return [data_unlabeled[idx] for idx in points]


def check_label_top_n(result, label, n=1):
    # returns True if label appears in the top n predictions, False otherwise
    result = result['data']['prediction'][:n]
    for r in result:
        if label in r['label']:
            return True
    return False


def choose_n_from_each_class(file_dict, n):
    # pop up to n files from each class; note this mutates file_dict in place
    ret_dict = {}
    for class_name in file_dict:
        ret_dict[class_name] = []
        for i in range(n):
            if file_dict[class_name]:
                ret_dict[class_name].append(file_dict[class_name].pop(0))
    return ret_dict


def run_inference_on_dict(file_dict, model_name='base',
url='http://127.0.0.1:3031/inceptionV3/predict'):
results = {}
for class_name in file_dict:
results[class_name] = []
for dp in file_dict[class_name]:
results[class_name].append(query_inference_server(dp, model_name,url))
return results








def choose_n(file_dict, n):
    # selects the first n files from each class (does not mutate file_dict)
    ret_dict = {}
    for class_name in file_dict:
        if file_dict[class_name]:
            ret_dict[class_name] = file_dict[class_name][:n]
    return ret_dict


def randomly_choose_n(file_list, n):
    # randomly selects n total files; the fixed seed keeps runs reproducible
    random.seed(90210)
    return random.sample(file_list, n)


def compute_accuracy(predictions, class_name):
    # fraction of images whose top-1 prediction matches class_name
    res = predictions[class_name]
    correct = sum(res[x]['data']['prediction'][0]['label'] == class_name
                  for x in range(len(res)))
    return float(correct) / len(res)


def feature_extraction(file_names, your_model):
    # given a list of image paths and a model, returns a list of flattened
    # feature vectors (one 1-D ndarray per image)
    feature_list = []
    for f in file_names:
        img = image.load_img(f, target_size=(299, 299))
        x = np.expand_dims(image.img_to_array(img), axis=0)
        x = preprocess_input(x)
        feature = your_model.predict(x)
        feature_list.append(np.array(feature).flatten())
    return feature_list


def pick_points_faster(unlabeled_features, model, n, labeled_features=None):
    # greedy farthest-point sampling: repeatedly pick the unlabeled point
    # whose distance to its nearest labeled point is largest
    labels = []
    if not labeled_features:
        # no labeled pool yet: seed it with the first unlabeled point
        labeled_features = [unlabeled_features[0]]
        labels.append(0)
        n = n - 1

    indices, values = pairwise_distances_argmin_min(unlabeled_features,
                                                    labeled_features)
    for i in range(n):
        max_of_min = np.argmax(values)
        labeled_features.append(unlabeled_features[max_of_min])
        labels.append(max_of_min)
        indices, values_new = pairwise_distances_argmin_min(unlabeled_features,
                                                            [labeled_features[-1]])
        # keep, for each point, the distance to its nearest labeled neighbor
        for j in range(len(unlabeled_features)):
            if values_new[j] < values[j]:
                values[j] = values_new[j]

    return labels


def cluster_label(file_names, your_model, n):
    # extract a feature vector per image, then group them into n kmeans clusters
    feature_list = feature_extraction(file_names, your_model)
    kmeans = KMeans(n_clusters=n, random_state=0).fit(np.array(feature_list))
    return kmeans




# external dependencies: keras, tensorflow, scikit-learn, requests, boto3
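
A sketch of how these pieces compose for one selection round, assuming a local ./data/train directory and the same topless InceptionV3 feature extractor built in main.py (the paths and batch size of 25 are illustrative):

    from keras.applications.inception_v3 import InceptionV3
    from fileio import load_directory, file_dict_to_flat
    from labeling import labeling_priority

    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
                              pooling='max', input_shape=(299, 299, 3))
    _, file_names = load_directory('./data/train')
    candidates = file_dict_to_flat(file_names)
    # greedy euclidean-distance selection: the 25 most spread-out images
    to_label = labeling_priority(candidates, iv3_topless, 25, method='ed')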
124 changes: 124 additions & 0 deletions main.py
@@ -0,0 +1,124 @@
import boto3
import requests
import os
import random
import shutil
import pickle
import time

from keras.applications.inception_v3 import InceptionV3

from fileio import *
from sherlockWrapper import *
from labeling import *

def random_images_loop(model_name, file_loc, base_model='inceptionV3', N_initial=100,
bucket='insightai2019', ip_addr='http://127.0.0.1:3031/'):
#main body for running random
output_path = './results/' + model_name
transfer_url = ip_addr + base_model + '/transfer'
inference_url = ip_addr + base_model + '/predict'
status_url = ip_addr + 'tasks/info'

    class_names, file_names = load_directory('./' + file_loc + '/train/')
    validate_class_names, validate_file_names = load_directory('./' +
                                                               file_loc + '/val/')
    test_class_names, test_file_names = load_directory('./' + file_loc + '/test/')

    train_dict = choose_n(file_names, N_initial)

    upload_to_S3(train_dict, os.path.join('models', model_name, 'train'))
    upload_to_S3(validate_file_names, os.path.join('models', model_name, 'val'))

r = train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url=transfer_url)
wait_for_training(r)
rid = r['task_id']
response = requests.post(status_url,json={rid:rid})
r_acc = response.json()
test_random = run_inference_on_dict(test_file_names, model_name)
acc_random = []
for k in test_random:
acc_random.append(compute_accuracy(test_random,k))

save_file_name = 'r{}.pickle'.format(0)
pickle_results(output_path, save_file_name, [r_acc,test_random])
return r_acc, test_random


def non_random_images_loop(model_name, file_loc, base_model='inceptionV3', N_initial=100,
                           bucket='insightai2019', ip_addr='http://127.0.0.1:3031/'):
# model_name = 'imgnet11.maxpool.584'
# file_loc = 'imgnetmodel'
# base_model = 'inceptionV3'
# N_initial = 584
# bucket = 'insightai2019'
# ip_addr='http://127.0.0.1:3031/'

output_path = './results/' + model_name
transfer_url = ip_addr + base_model + '/transfer'
inference_url = ip_addr + base_model + '/predict'
status_url = ip_addr + 'tasks/info'
retrain_url=ip_addr + 'inceptionV3/retrain'

# iv3 = InceptionV3(weights='imagenet',input_shape=(299,299,3))
    iv3_topless = InceptionV3(include_top=False, weights='imagenet', pooling='max',
                              input_shape=(299, 299, 3))

    class_names, file_names = load_directory('./' + file_loc + '/train/')
    validate_class_names, validate_file_names = load_directory('./' +
                                                               file_loc + '/val/')
    class_names, test_file_names = load_directory('./' + file_loc + '/test/')

file_list = []
file_labels = []
for k in file_names:
file_list.extend(file_names[k])
file_labels.extend([k] * len(file_names[k]))

unlabeled_features = feature_extraction(file_list, iv3_topless)
points = pick_points_faster(unlabeled_features, iv3_topless, N_initial)
labeled_files = [file_list[idx] for idx in points]

upload_dict = {k :[] for k in class_names}
for idx in points:
upload_dict[file_labels[idx]].append(file_list[idx])
upload_to_S3(upload_dict,os.path.join('models',model_name,'train'))
upload_to_S3(validate_file_names, os.path.join('models',model_name,'val'))

r = train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url=transfer_url)
wait_for_training(r)
rid = r['task_id']
response = requests.post(status_url,json={rid:rid})
    train_acc = response.json()  # 83.6% training, 81.6% validation accuracy
    test_results = run_inference_on_dict(test_file_names, model_name)
    test_acc = []
    for k in test_results:
        test_acc.append(compute_accuracy(test_results, k))

return train_acc, test_acc


def main(model_name, base_model='inceptionV3', N_initial=5,
         iterations=1, labelsPerRound=5, bucket='insightai2019',
         ip_addr='http://127.0.0.1:3031/'):
    # NOTE: the experiment settings below intentionally override the arguments
    model_name = 'HotWineBike1kRandom'
    base_model = 'inceptionV3'
    N_initial = 25
    iterations = 1
    labelsPerRound = 25
    bucket = 'insightai2019'
    output_path = './results/' + model_name
    ip_addr = 'http://127.0.0.1:3031/'
    transfer_url = ip_addr + base_model + '/transfer'
    inference_url = ip_addr + base_model + '/predict'
    status_url = ip_addr + 'tasks/info'
    retrain_url = ip_addr + 'inceptionV3/retrain'
    iv3 = InceptionV3(weights='imagenet', input_shape=(299, 299, 3))
    iv3_topless = InceptionV3(include_top=False, weights='imagenet',
                              input_shape=(299, 299, 3))
    # load the images - array of Images


if __name__ == '__main__':
main('tomato_potato')
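
For comparison runs, the two loops above are intended to be pointed at the same data directory; a hedged sketch (the model names and file_loc are illustrative):

    r_acc, test_random = random_images_loop('demo.random', 'imgnetmodel', N_initial=100)
    train_acc, test_acc = non_random_images_loop('demo.ed', 'imgnetmodel', N_initial=100)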

17 changes: 17 additions & 0 deletions misc.py
@@ -0,0 +1,17 @@
import glob
import os
import random
import shutil

def train_test_validate(path, model_path, n_test=50, n_val=50,
                        names=['test', 'val', 'train']):
    # shuffle the files of one class directory, then move them into
    # test/val/train subdirectories under model_path
    class_name = os.path.split(path)[-1]
    file_names = glob.glob(os.path.join(path, '*'))
    destinations = [os.path.join(model_path, x, class_name) for x in names]
    random.shuffle(file_names)
    for d in destinations:
        if not os.path.isdir(d):
            os.makedirs(d)

    splits = [file_names[:n_test],
              file_names[n_test:n_test + n_val],
              file_names[n_test + n_val:]]
    for dest, files in zip(destinations, splits):
        for f in files:
            shutil.move(f, os.path.join(dest, os.path.split(f)[-1]))
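
A usage sketch, assuming raw per-class image folders under ./raw (hypothetical paths): one call per class produces the test/val/train tree that load_directory() and the loops in main.py expect:

    import glob
    from misc import train_test_validate

    for class_dir in glob.glob('./raw/*'):
        train_test_validate(class_dir, './data', n_test=50, n_val=50)
    # result: ./data/test/<class>/, ./data/val/<class>/, ./data/train/<class>/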
57 changes: 57 additions & 0 deletions sherlockWrapper.py
@@ -0,0 +1,57 @@
import os
import time

import boto3
import requests


def query_inference_server(fileName, model_name='base',
                           url='http://127.0.0.1:3031/inceptionV3/predict'):
form_data = {'model_name': model_name}
files = {'image': open(fileName, 'rb')}
response = requests.post(url, files=files, data=form_data)
return response.json()


def train_new_model(model_name, bucket_name='insightai2019', path_prefix='models',
url='http://127.0.0.1:3031/inceptionV3/transfer'):
form_data = {
'train_bucket_name': bucket_name,
'train_bucket_prefix': os.path.join(path_prefix, model_name)
}

response = requests.post(url, data=form_data)
return response.json()


def check_status(task_id, url='http://127.0.0.1:3031/tasks/info'):
    # True once the server marks the training task as finished
    response = requests.post(url, json={task_id: task_id})
    return response.json()['Tasks Status'][0]['status'] == 'SUCCESS'


def retrain_model(model_name, path, bucket_name='insightai2019',
nb_epoch=3, batch_size=2,
url='http://127.0.0.1:3031/inceptionV3/retrain'):
form_data = {
'nb_epoch': nb_epoch,
'batch_size': batch_size,
'train_bucket_name': bucket_name,
'train_bucket_prefix': os.path.join(path, model_name)
}

response = requests.post(url, data=form_data)
return response.json()


def upload_to_S3(file_dict, key_path, bucket_name='insightai2019'):
    # push all files in file_dict to S3 under key_path/<class_name>/<file>
    s3 = boto3.client('s3')
    for key in file_dict:
        for datapoint in file_dict[key]:
            fname = os.path.split(datapoint)[-1]
            file_key = os.path.join(key_path, key, fname)
            print(file_key)
            s3.upload_file(datapoint, bucket_name, file_key)


def wait_for_training(response, t=20, t_max=900,
                      url='http://127.0.0.1:3031/tasks/info'):
    # poll the task endpoint until training reports SUCCESS, growing the
    # wait 10% per check and capping any single wait at t_max seconds
    status = check_status(response['task_id'], url)
    while not status:
        time.sleep(t)
        t = min(t + t / 10, t_max)
        status = check_status(response['task_id'], url)
    return 1
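
Taken together, the wrappers cover one full train-and-predict cycle. A minimal end-to-end sketch, assuming the inference server is running locally on port 3031 and the default S3 bucket is reachable (the 'demo' model name and image paths are illustrative):

    train_dict = {'hotdog': ['./data/train/hotdog/001.jpg']}
    upload_to_S3(train_dict, 'models/demo/train')
    r = train_new_model('demo')        # kicks off transfer learning on the server
    wait_for_training(r)               # poll until the task reports SUCCESS
    result = query_inference_server('./data/test/hotdog/042.jpg', model_name='demo')
    print(result['data']['prediction'][0]['label'])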