Add test
KenelmQLH committed Mar 2, 2024
1 parent df6ae97 commit 84b79c7
Showing 8 changed files with 52 additions and 75 deletions.
8 changes: 4 additions & 4 deletions EduNLP/I2V/i2v.py
@@ -11,7 +11,7 @@
from longling import path_append
from EduData import get_data
from ..Tokenizer import Tokenizer, get_tokenizer
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, AutoTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
from EduNLP.Pretrain import ElmoTokenizer, BertTokenizer, HfAutoTokenizer, DisenQTokenizer, QuesNetTokenizer, Question
from EduNLP import logger

__all__ = ["I2V", "D2V", "W2V", "Elmo", "Bert", "HfAuto", "DisenQ", "QuesNet", "get_pretrained_i2v"]
@@ -70,7 +70,7 @@ def __init__(self, tokenizer, t2v, *args, tokenizer_kwargs: dict = None,
self.tokenizer = BertTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'hf_auto':
self.tokenizer = AutoTokenizer.from_pretrained(
self.tokenizer = HfAutoTokenizer.from_pretrained(
**tokenizer_kwargs if tokenizer_kwargs is not None else {})
elif tokenizer == 'quesnet':
self.tokenizer = QuesNetTokenizer.from_pretrained(
@@ -655,6 +655,6 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'):
raise KeyError(
"Unknown model name %s, use one of the provided models: %s" % (name, ", ".join(pretrained_models))
)
_, t2v = get_pretrained_model_info(name)
_class, *params = MODEL_MAP[t2v], name
_, i2v = get_pretrained_model_info(name)
_class, *params = MODEL_MAP[i2v], name
return _class.from_pretrained(*params, model_dir=model_dir, device=device)
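A minimal usage sketch of the corrected `get_pretrained_i2v` lookup above; the model name mirrors the test fixture used later in this commit, and the directory is a placeholder rather than a path guaranteed by the repo:

```python
from EduNLP.I2V import get_pretrained_i2v

# "w2v_test_256" mirrors the fixture name used in tests/test_pretrain below;
# model_dir is a hypothetical local download directory.
i2v = get_pretrained_i2v("w2v_test_256", model_dir="path/to/model_dir", device="cpu")

# get_pretrained_i2v resolves the name to its backend and returns the matching
# I2V subclass via MODEL_MAP, as in the corrected lookup above; calling it is
# expected to yield item- and token-level vectors.
item_vec, token_vec = i2v(["已知$x+y=1$,则$x$的最大值为$\\SIFBlank$"])
```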
34 changes: 17 additions & 17 deletions EduNLP/Pretrain/auto_vec.py
@@ -13,8 +13,8 @@
from .hugginface_utils import TokenizerForHuggingface

__all__ = [
"AutoTokenizer",
"AutoDataset",
"HfAutoTokenizer",
"HfAutoDataset",
"pretrain_hf_auto_model",
"finetune_hf_auto_model_for_property_prediction",
"finetune_hf_auto_model_for_knowledge_prediction",
@@ -42,11 +42,11 @@
}


class AutoTokenizer(TokenizerForHuggingface):
class HfAutoTokenizer(TokenizerForHuggingface):
"""
Examples
----------
>>> tokenizer = AutoTokenizer(add_special_tokens=True)
>>> tokenizer = HfAutoTokenizer(add_special_tokens=True)
>>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
>>> token_item = tokenizer(item)
@@ -63,13 +63,13 @@ class AutoTokenizer(TokenizerForHuggingface):
>>> print(len(tokenizer.tokenize(items)))
2
>>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = AutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = HfAutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
"""

pass


class AutoDataset(EduDataset):
class HfAutoDataset(EduDataset):
pass
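One plausible reading of the rename (an assumption about the motivation, not stated in the diff) is that the EduNLP wrapper no longer shadows HuggingFace's own `AutoTokenizer`, so both can be imported side by side. A minimal sketch using the constructor arguments that appear elsewhere in this diff:

```python
from transformers import AutoTokenizer          # HuggingFace's generic loader
from EduNLP.Pretrain import HfAutoTokenizer     # EduNLP's SIF-aware wrapper (renamed here)

hf_tok = AutoTokenizer.from_pretrained("bert-base-chinese")
edu_tok = HfAutoTokenizer(pretrained_model="bert-base-chinese",
                          add_specials=True, tokenize_method="pure_text")

# As in the class docstring above, the wrapper can be called directly on a SIF item.
encodes = edu_tok("已知$x+y=1$,则$x$的最大值为$\\SIFBlank$")
```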


@@ -114,22 +114,22 @@ def pretrain_hf_auto_model(
train_params = train_params if train_params is not None else {}
# tokenizer configuration
if os.path.exists(pretrained_model):
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
else:
work_tokenizer_params = {
"add_specials": True,
"tokenize_method": "pure_text",
}
work_tokenizer_params.update(tokenizer_params)
tokenizer = AutoTokenizer(pretrained_model, **work_tokenizer_params)
tokenizer = HfAutoTokenizer(pretrained_model, **work_tokenizer_params)
# TODO: tokenizer.set_vocab()
# model configuration
model = AutoModelForMaskedLM.from_pretrained(pretrained_model, **model_params)
# resize embedding for additional special tokens
model.resize_token_embeddings(len(tokenizer.bert_tokenizer))

# dataset configuration
dataset = AutoDataset(
dataset = HfAutoDataset(
tokenizer, items=items, stem_key=data_params.get("stem_key", None)
)
mlm_probability = train_params.pop("mlm_probability", 0.15)
@@ -187,16 +187,16 @@ def finetune_hf_auto_model_for_property_prediction(
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = AutoDataset(
train_dataset = HfAutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"),
)
if eval_items is not None:
eval_dataset = AutoDataset(
eval_dataset = HfAutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
@@ -206,7 +206,7 @@
eval_dataset = None
# model configuration
model = HfModelForPropertyPrediction(pretrained_model, **model_params)
model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))
model.model.resize_token_embeddings(len(tokenizer.bert_tokenizer))
# training configuration
work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS)
work_train_params["output_dir"] = output_dir
@@ -262,16 +262,16 @@ def finetune_hf_auto_model_for_knowledge_prediction(
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = AutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = AutoDataset(
train_dataset = HfAutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"),
)
if eval_items is not None:
eval_dataset = AutoDataset(
eval_dataset = HfAutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
@@ -281,7 +281,7 @@
eval_dataset = None
# model configuration
model = HfModelForKnowledgePrediction(pretrained_model_dir=pretrained_model, **model_params)
model.bert.resize_token_embeddings(len(tokenizer.bert_tokenizer))
model.model.resize_token_embeddings(len(tokenizer.bert_tokenizer))
# training configuration
work_train_params = deepcopy(DEFAULT_TRAIN_PARAMS)
work_train_params["output_dir"] = output_dir
36 changes: 1 addition & 35 deletions EduNLP/Pretrain/quesnet_vec.py
@@ -383,57 +383,23 @@ class QuesnetDataset(Dataset):
def __init__(
self,
items=None,
filename: str = None,
tokenizer: str = None,
content_key=lambda x: x["ques_content"],
answer_key=lambda x: x["ques_answer"],
option_key=lambda x: x["ques_options"],
pipeline=None,
skip=0
):
self.filename = filename
self.skip = skip
self.content_key = content_key
self.answer_key = answer_key
self.option_key = option_key
self.pipeline = pipeline
if items is None and filename is not None:
self.load_data_lines()
else:
self.lines = items
self.lines = items

self.tokenizer = tokenizer
self.meta = tokenizer.meta

def load_data_lines(self):
"""Read data by row from a JSON file
Important: the data file is loaded during initialization.
"""

# TODO: All data is read into memory without chunking.
# This may lead to low efficiency.
data_dir = self.filename
skip = self.skip # Read from Line skip + 1.
self.lines = []
self.length = 0

with open(data_dir, "r", encoding="utf-8") as f:
row = 0
while True:
row += 1
line = f.readline()
if row <= skip:
continue
if not line:
break
self.lines.append(json.loads(line.strip()))

self.length = row - skip - 1
assert (
self.length > 0
), f"{data_dir} is empty. Or file length is less than skip length."

def __len__(self):
return len(self.lines)
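With the file-reading path removed, `QuesnetDataset` now expects the question records to be loaded up front and passed in as `items`. A minimal sketch, assuming a JSON-lines question file and the module's import path:

```python
import json

from EduNLP.Pretrain import QuesNetTokenizer
from EduNLP.Pretrain.quesnet_vec import QuesnetDataset  # import path assumed

# Load the question records externally, one JSON object per line (hypothetical file).
with open("quesnet_questions.json", "r", encoding="utf-8") as f:
    items = [json.loads(line) for line in f if line.strip()]

tokenizer = QuesNetTokenizer.from_pretrained("path/to/pretrained_tokenizer")  # placeholder path
dataset = QuesnetDataset(items=items, tokenizer=tokenizer)
print(len(dataset))  # __len__ now simply reports len(self.lines)
```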

2 changes: 1 addition & 1 deletion EduNLP/Vector/__init__.py
@@ -7,7 +7,7 @@
from .t2v import T2V, get_pretrained_t2v, get_pretrained_model_info, get_all_pretrained_models
from .embedding import Embedding
from .bert_vec import BertModel
from .auto_vec import AutoModel
from .auto_vec import HfAutoModel
from .quesnet import QuesNetModel
from .disenqnet import DisenQModel
from .elmo_vec import ElmoModel
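A minimal sketch of the renamed vector wrapper, mirroring the `test_t2v` case later in this commit; the model directory is a placeholder for a locally pretrained hf_auto checkpoint:

```python
from EduNLP.Pretrain import HfAutoTokenizer
from EduNLP.Vector import HfAutoModel

items = [{"stem": "若$x,y$满足约束条件$\\SIFSep$,则$z=x+7y$的最大值为$\\SIFBlank$"}]

tokenizer = HfAutoTokenizer.from_pretrained("path/to/pretrained_model")  # placeholder path
encodes = tokenizer(items, key=lambda x: x["stem"])

t2v = HfAutoModel("path/to/pretrained_model")
output = t2v(encodes)
print(output.shape[2] == t2v.vector_size)  # token-level embeddings, as asserted in test_t2v
```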
28 changes: 16 additions & 12 deletions tests/test_pretrain/test_pretrained_hf_model.py
@@ -4,8 +4,11 @@
import torch
from EduNLP.ModelZoo.hf_model import HfModelForPropertyPrediction, HfModelForKnowledgePrediction
from transformers import AutoModel
from EduNLP.Pretrain import AutoTokenizer, pretrain_hf_auto_model
from EduNLP.Pretrain import finetune_hf_auto_model_for_property_prediction, finetune_hf_auto_model_for_knowledge_prediction
from EduNLP.Pretrain import HfAutoTokenizer, pretrain_hf_auto_model
from EduNLP.Pretrain import (
finetune_hf_auto_model_for_property_prediction,
finetune_hf_auto_model_for_knowledge_prediction
)
from EduNLP.Vector import T2V, HfAutoModel
from EduNLP.I2V import HfAuto, get_pretrained_i2v

@@ -25,15 +25,28 @@ def test_tokenizer(self, standard_luna_data, pretrained_tokenizer_dir):
"granularity": "char",
# "stopwords": None,
}
tokenizer = AutoTokenizer(pretrained_model="bert-base-chinese", add_specials=True,
tokenize_method="ast_formula", text_params=text_params)

tokenizer = HfAutoTokenizer(pretrained_model="bert-base-chinese",
add_specials=True,
tokenize_method="ast_formula",
text_params=text_params)
tokenizer_size1 = len(tokenizer)
tokenizer.set_vocab(standard_luna_data, key=lambda x: x["ques_content"])
tokenizer_size2 = len(tokenizer)
assert tokenizer_size1 < tokenizer_size2
tokenizer.save_pretrained(pretrained_tokenizer_dir)
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_tokenizer_dir)
tokenizer_size3 = len(tokenizer)
assert tokenizer_size2 == tokenizer_size3
tokens = tokenizer.tokenize(test_items, key=lambda x: x["ques_content"])
@@ -48,7 +48,7 @@ def test_tokenizer(self, standard_luna_data, pretrained_tokenizer_dir):
assert isinstance(res["input_ids"], list)

def test_train_model(self, standard_luna_data, pretrained_model_dir, pretrained_tokenizer_dir):
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_tokenizer_dir)
items = [
{'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,\
@@ -70,7 +74,7 @@ def test_train_model(self, standard_luna_data, pretrained_model_dir, pretrained_
}
)
model = AutoModel.from_pretrained(pretrained_model_dir)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model_dir)

encodes = tokenizer(items[0], lambda x: x['ques_content'])
model(**encodes)
@@ -122,7 +126,7 @@ def test_train_pp(self, standard_luna_data, pretrained_pp_dir, pretrained_model_
data_params=data_params
)
model = HfModelForPropertyPrediction.from_pretrained(pretrained_pp_dir)
tokenizer = AutoTokenizer.from_pretrained(pretrained_pp_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_pp_dir)

encodes = tokenizer(train_items[:8], lambda x: x['ques_content'])
# TODO: need to handle inference for T2V for batch or single
@@ -164,7 +168,7 @@ def test_train_kp(self, standard_luna_data, pretrained_model_dir, pretrained_kp_
model_params=model_params
)
model = HfModelForKnowledgePrediction.from_pretrained(pretrained_kp_dir)
tokenizer = AutoTokenizer.from_pretrained(pretrained_kp_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_kp_dir)

encodes = tokenizer(train_items[:8], lambda x: x['ques_content'])
# TODO: need to handle inference for T2V for batch or single
@@ -175,10 +179,10 @@ def test_t2v(self, pretrained_model_dir):
{'stem': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$, \
若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}
]
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_dir)
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model_dir)
encodes = tokenizer(items, key=lambda x: x['stem'])

t2v = AutoModel(pretrained_model_dir)
t2v = HfAutoModel(pretrained_model_dir)
output = t2v(encodes)
assert output.shape[2] == t2v.vector_size

19 changes: 13 additions & 6 deletions tests/test_pretrain/test_pretrained_quesnet.py
@@ -5,7 +5,7 @@
from EduNLP.ModelZoo.quesnet import QuesNet
from EduNLP.Pretrain import QuesNetTokenizer, Question, pretrain_quesnet
# from EduNLP.Pretrain import train_quesnet_for_property_prediction, train_quesnet_for_knowledge_prediction
from EduNLP.Vector import T2V
from EduNLP.Vector import T2V, W2V
from EduNLP.Vector.quesnet import QuesNetModel
from EduNLP.I2V import QuesNet as QuesNetI2V, get_pretrained_i2v
from EduNLP.utils import abs_current_dir, path_append
@@ -81,12 +81,15 @@ def test_train_quesnet(self, standard_luna_data, pretrained_model_dir):
ques_items = load_items(ques_file)
img_dir = path_append(abs_current_dir(__file__),
"../../static/test_data/quesnet_img", to_str=True)
get_pretrained_i2v("w2v_test_256", pretrained_model_dir)
wv = W2V(os.path.join(pretrained_model_dir, "w2v_test_256/w2v_test_256.kv")).wv
pretrain_quesnet(
ques_items,
pretrained_model_dir,
pretrain_dir=pretrained_model_dir,
img_dir=img_dir,
save_embs=True,
load_embs=False,
load_embs=True,
# data_params={
# "stem_key": "ques_content"
# },
@@ -96,11 +99,15 @@
# "per_device_eval_batch_size": 2,
# "no_cuda": not TEST_GPU,
'max_steps': 2,
'feat_size': 256,
'save_every': 1,
'emb_size': 256,
'feat_size': 300,
"log_steps": 1,
'save_every_epochs': 1,
'save_every_steps': 1,
'emb_size': 300,
'feat_size': 300,
'device': "cpu"
}
},
pretrained_wv=wv,
)

tokenizer = QuesNetTokenizer.from_pretrained(pretrained_model_dir, img_dir=img_dir)
Binary file modified trained_ie.pt
Binary file not shown.
Binary file modified trained_me.pt
Binary file not shown.
