run_valse_vlm_decoders.py
import time, sys
import copy  # copy.deepcopy is used on the prompt inputs below (it may also arrive via the * imports)
import torch
print("Cuda is available:", torch.cuda.is_available())
from accelerate import Accelerator
import pandas as pd
import json
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoProcessor, LlavaForConditionalGeneration, LlavaNextForConditionalGeneration, AutoConfig
from PIL import Image
import random, os
from tqdm import tqdm
from read_datasets import read_data
from generation_and_prompting import *
from mm_shap_cc_shap import *
from other_faith_tests import *
from config import *

torch.cuda.empty_cache()
accelerator = Accelerator()
accelerator.free_memory()

from transformers.utils import logging
logging.set_verbosity_error()
import logging  # shadows transformers.utils.logging from here on; set_verbosity_error() has already run
logging.getLogger('shap').setLevel(logging.ERROR)

random.seed(42)
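# MODELS, LABELS, MULT_CHOICE_DATA and OPEN_ENDED_DATA used below are assumed to come from the
# star-imported project modules (presumably config.py): they map model names to checkpoints and
# task names to label sets and data paths.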
t1 = time.time()
c_task = sys.argv[1]
model_name = sys.argv[2]
save_json = int(sys.argv[3])
data_root = sys.argv[4]
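# Expected invocation (sketch; argument meanings follow the assignments above):
#   python run_valse_vlm_decoders.py <c_task> <model_name> <save_json: 0|1> <data_root>
# model_name should be a key of MODELS (e.g. "bakllava"); data_root is the directory holding the
# VALSE annotations and images. Note that c_task is overridden by the task loop in __main__.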
# load model
if "mplug" in model_name:
    config = AutoConfig.from_pretrained(MODELS[model_name], trust_remote_code=True)
    with torch.no_grad():
        model = AutoModel.from_pretrained(MODELS[model_name], attn_implementation='sdpa', torch_dtype=torch.half, trust_remote_code=True).to("cuda").eval()  # device_map="auto"
# elif model_name == "llava_vicuna":
#     from transformers import BitsAndBytesConfig
#     # specify how to quantize the model with bitsandbytes
#     quantization_config = BitsAndBytesConfig(
#         # load_in_8bit=True,
#         load_in_4bit=True,
#         bnb_4bit_quant_type="nf4",
#         bnb_4bit_compute_dtype=torch.float16,
#     )  # for 8-bit, use just load_in_8bit=True
#     with torch.no_grad():
#         model = LlavaNextForConditionalGeneration.from_pretrained(MODELS[model_name], torch_dtype=torch.float16,
#                                                                   low_cpu_mem_usage=True,
#                                                                   use_flash_attention_2=True,
#                                                                   quantization_config=quantization_config,
#                                                                   )  # .to("cuda") not needed with bitsandbytes anymore
else:
    if model_name == "bakllava":
        ModelClass = LlavaForConditionalGeneration
    else:
        ModelClass = LlavaNextForConditionalGeneration
    with torch.no_grad():
        model = ModelClass.from_pretrained(MODELS[model_name], torch_dtype=torch.float16,
                                           low_cpu_mem_usage=True,  # device_map="auto"
                                           # use_flash_attention_2=True,
                                           ).to("cuda")
# load tokenizer
if "mplug" in model_name:
    tokenizer_real = AutoTokenizer.from_pretrained(MODELS[model_name])
    processor = model.init_processor(tokenizer_real)
    tokenizer = {"tokenizer": tokenizer_real, "processor": processor}
else:
    tokenizer = AutoProcessor.from_pretrained(MODELS[model_name])
print(f"Done loading model and tokenizer after {time.time()-t1:.2f}s.")
if __name__ == '__main__':
    ############################# run evals on all valse instruments
    for c_task in MULT_CHOICE_DATA.keys():
        if c_task != "mscoco":
            res_dict = {}
            formatted_samples_pairwise, formatted_samples_caption, formatted_samples_foil = [], [], []
            correct_answers, wrong_answers, image_paths = [], [], []
            acc_r, p_c, p_f = 0, 0, 0
            count = 0
            print("Preparing data...")

            # read the valse data from the json files
            images_path = f"{data_root}{MULT_CHOICE_DATA[c_task][0]}"
            foils_path = f"{data_root}{MULT_CHOICE_DATA[c_task][1]}"
            foils_data = read_data(c_task, foils_path, images_path, data_root)

            for foil_id, foil in foils_data.items():  # tqdm
                if c_task == 'mscoco':
                    # for everything other than VALSE: pretend the sample was accepted by annotators
                    caption_fits = 3
                else:  # the subtask stems from VALSE data
                    caption_fits = foil['mturk']['caption']  # take only samples accepted by annotators
                if caption_fits >= 2:  # MTurk filtering: use only the annotator-validated set
                    test_img_path = os.path.join(images_path, foil["image_file"])
                    if c_task == 'mscoco':
                        confounder = random.sample(sorted(foils_data.items()), 1)[0][1]
                        test_sentences = [foil["caption"], confounder["caption"]]
                    else:
                        if c_task == 'plurals':
                            test_sentences = [foil["caption"][0], foil["foils"][0]]
                        else:
                            test_sentences = [foil["caption"], foil["foils"][0]]
                    # shuffle the order of caption and foil so that the correct answer is not always A
                    if random.choice([0, 1]) == 0:
                        formatted_sample_pairwise = format_example_valse_pairwise(test_sentences[0], test_sentences[1])
                        correct_answer, wrong_answer = 'A', 'B'
                    else:
                        formatted_sample_pairwise = format_example_valse_pairwise(test_sentences[1], test_sentences[0])
                        correct_answer, wrong_answer = 'B', 'A'
                    formatted_sample_caption = format_example_valse(test_sentences[0])
                    formatted_sample_foil = format_example_valse(test_sentences[1])

                    formatted_samples_pairwise.append(formatted_sample_pairwise)
                    formatted_samples_caption.append(formatted_sample_caption)
                    formatted_samples_foil.append(formatted_sample_foil)
                    correct_answers.append(correct_answer)
                    wrong_answers.append(wrong_answer)
                    image_paths.append(test_img_path)
                    count += 1

            print("Done preparing data. Running test...")
            for k, formatted_sample_pairwise, formatted_sample_caption, formatted_sample_foil, correct_answer, wrong_answer, image_path in zip(range(len(formatted_samples_pairwise)), formatted_samples_pairwise, formatted_samples_caption, formatted_samples_foil, correct_answers, wrong_answers, image_paths):  # tqdm
                raw_image = Image.open(image_path).convert("RGB")  # read image

                if c_task in MULT_CHOICE_DATA.keys():
                    labels = LABELS['binary']
                elif c_task in OPEN_ENDED_DATA.keys():
                    labels = None
                else:
                    labels = LABELS[c_task]

                # t7 = time.time()
                # compute model accuracy post-hoc
                inp_ask_for_prediction = prompt_answer_with_input(formatted_sample_pairwise, c_task)
                prediction = vlm_predict(copy.deepcopy(inp_ask_for_prediction), raw_image, model, tokenizer, c_task, labels=labels)
                acc_r_sample = evaluate_prediction(prediction, correct_answer, c_task)
                acc_r += acc_r_sample

                inp_ask_for_prediction = prompt_answer_with_input(formatted_sample_caption, c_task)
                prediction = vlm_predict(copy.deepcopy(inp_ask_for_prediction), raw_image, model, tokenizer, c_task, labels=labels)
                p_c_sample = evaluate_prediction(prediction, 'A', c_task)
                p_c += p_c_sample

                inp_ask_for_prediction = prompt_answer_with_input(formatted_sample_foil, c_task)
                prediction = vlm_predict(copy.deepcopy(inp_ask_for_prediction), raw_image, model, tokenizer, c_task, labels=labels)
                p_f_sample = evaluate_prediction(prediction, 'B', c_task)
                p_f += p_f_sample
                # c = time.time()-t7
                # print(f"A step ran for {c // 60 % 60:.2f} minutes, {c % 60:.2f} seconds.")

                res_dict[f"{c_task}_{model_name}_{k}"] = {
                    "image_path": image_path,
                    "sample": formatted_sample_pairwise,
                    "correct_answer": correct_answer,
                    "post-hoc": {
                        "acc_r": acc_r_sample,
                        "p_c": p_c_sample,
                        "p_f": p_f_sample,
                    },
                }
            if save_json:
                # save results to a json file; create the results_json_valse directory if it does not exist
                if not os.path.exists('results_json_valse'):
                    os.makedirs('results_json_valse')
                with open(f"results_json_valse/{c_task}_{model_name}_{count}_valse_eval.json", 'w') as file:
                    json.dump(res_dict, file)
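            # The dumped JSON maps "<task>_<model>_<k>" keys to per-sample records; it can be
            # reloaded later for analysis, e.g. (sketch):
            #   with open("results_json_valse/..._valse_eval.json") as f:
            #       res = json.load(f)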
print(f"Ran valse eval on {c_task} {count} samples with model {model_name}. Reporting accuracy metrics.\n")
print(f"acc_r % : {acc_r*100/count:.2f} ")
print(f"p_c % : {p_c*100/count:.2f} ")
print(f"p_f % : {p_f*100/count:.2f} ")
c = time.time()-t1
print(f"\nThis script ran for {c // 86400:.2f} days, {c // 3600 % 24:.2f} hours, {c // 60 % 60:.2f} minutes, {c % 60:.2f} seconds.")