Commit d9c9b50

remove label selecting in dataset

donglihe-hub committed Mar 21, 2024
1 parent 15bab60 commit d9c9b50
Showing 5 changed files with 71 additions and 178 deletions.
42 changes: 0 additions & 42 deletions example_config/AmazonCat-13K/attentionxml.yml

This file was deleted.

2 changes: 1 addition & 1 deletion libmultilabel/nn/cluster.py
@@ -54,7 +54,7 @@ def build_label_tree(sparse_x: csr_matrix, sparse_y: csr_matrix, cluster_size: i
for _ in range(height):
next_clusters = []
for cluster in clusters:
next_clusters.extend(_split_cluster(cluster, label_repr[cluster]))
next_clusters += _split_cluster(cluster, label_repr[cluster])
clusters = next_clusters
logger.info(f"Having grouped {len(clusters)} clusters")

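For context, the _split_cluster helper called in this loop halves a cluster at each level of the label tree. Its implementation is not shown in this hunk; below is a minimal single-pass sketch of a balanced 2-means-style split, assuming cluster is an array of label indices and the second argument holds sparse label features (the real helper may iterate centroid updates):

import numpy as np
from scipy.sparse import csr_matrix

def _split_cluster(cluster: np.ndarray, repr_: csr_matrix):
    """Split one cluster of label indices into two equal halves.

    Labels are ranked by how much closer they are to one seed centroid
    than to the other; cutting the ranking in the middle keeps the two
    children the same size, so the label tree stays balanced.
    """
    # Use the first and last labels as provisional centroids.
    c0, c1 = repr_[0], repr_[-1]
    # Positive values lean toward c0, negative toward c1.
    diff = (repr_ @ c0.T).toarray().ravel() - (repr_ @ c1.T).toarray().ravel()
    order = np.argsort(-diff)
    mid = len(cluster) // 2
    return cluster[order[:mid]], cluster[order[mid:]]

The loop above then re-collects the two halves into next_clusters, repeating once per level until the requested tree height is reached.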
75 changes: 13 additions & 62 deletions libmultilabel/nn/datasets_AttentionXML.py
@@ -19,11 +19,11 @@ class PlainDataset(Dataset):
this while generating clusters. There is no need to do multilabel binarization again.
Args:
x: texts
y: labels
x (list | ndarray | Tensor): texts
y (Optional: csr_matrix | ndarray | Tensor): labels
"""

def __init__(self, x: list | ndarray | Tensor, y: Optional[csr_matrix | ndarray | Tensor] = None):
def __init__(self, x, y=None):
if y is not None:
assert len(x) == y.shape[0], "Size mismatch between texts and labels"
self.x = x
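As a quick illustration of the loosened constructor, a PlainDataset can now be built from indexable texts plus an optional binarized label matrix. A minimal sketch with made-up data, assuming the import path matches this file:

import numpy as np
from scipy.sparse import csr_matrix
from libmultilabel.nn.datasets_AttentionXML import PlainDataset

x = [["deep", "learning"], ["label", "tree"]]      # two tokenized texts
y = csr_matrix(np.array([[1, 0, 1], [0, 1, 0]]))   # 2 instances, 3 labels
dataset = PlainDataset(x, y)                       # len(x) must equal y.shape[0]
unlabeled = PlainDataset(x)                        # y stays optional for prediction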
@@ -56,77 +56,28 @@ class PLTDataset(PlainDataset):
Args:
x: texts
y: labels
num_classes: number of classes.
mapping: mapping from clusters to labels. Shape: (len(clusters), cluster_size).
clusters_selected: sampled predicted clusters from model_0. Shape: (len(x), predict_top_k).
cluster_scores: corresponding scores. Shape: (len(x), predict_top_k)
labels_selected: sampled predicted labels from model_0. Shape: (len(x), predict_top_k).
label_scores: scores for each label. Shape: (len(x), predict_top_k)
"""

def __init__(
self,
x,
y: Optional[csr_matrix | ndarray] = None,
*,
num_classes: int,
mapping: ndarray,
clusters_selected: ndarray | Tensor,
cluster_scores: Optional[ndarray | Tensor] = None,
labels_selected: ndarray | Tensor,
label_scores: Optional[ndarray | Tensor] = None,
):
super().__init__(x, y)
self.num_classes = num_classes
self.mapping = mapping
self.clusters_selected = clusters_selected
self.cluster_scores = cluster_scores
self.label_scores = None

# labels_selected are labels extracted from clusters_selected.
# shape: (len(x), len(clusters_selected) * cluster_size)
self.labels_selected = [
np.concatenate(self.mapping[labels])
for labels in tqdm(self.clusters_selected, leave=False, desc="Retrieving labels from selected clusters")
]
if self.cluster_scores is not None:
# label_scores are probability scores corresponding to labels_selected.
# shape: (len(x), len(clusters_selected) * cluster_size)
self.label_scores = [
np.repeat(scores, [len(i) for i in self.mapping[labels]])
for labels, scores in zip(self.clusters_selected, self.cluster_scores)
]

# top_k * n (n <= cluster_size): the maximum possible number of selected labels at the current level.
self.num_labels_selected = self.clusters_selected.shape[1] * max(len(clusters) for clusters in self.mapping)
self.labels_selected = labels_selected
self.label_scores = label_scores

def __getitem__(self, idx: int):
item = {"text": self.x[idx], "labels_selected": np.asarray(self.labels_selected[idx], dtype=np.int64)}
item = {"text": self.x[idx], "labels_selected": np.asarray(self.labels_selected[idx])}

# train/valid/test
if self.y is not None:
item["label"] = self.y[idx].toarray().squeeze(0)

# train
if self.label_scores is None:
# As networks require input to be of fixed shape, randomly select labels when the number of selected labels
# is not enough
if len(item["labels_selected"]) < self.num_labels_selected:
sample = np.random.randint(
self.num_classes, size=self.num_labels_selected - len(item["labels_selected"])
)
item["labels_selected"] = np.concatenate([item["labels_selected"], sample])
# valid/test
else:
item["label_scores"] = self.label_scores[idx]
item["label"] = self.y[idx, item["labels_selected"]].toarray().squeeze(0)

# add dummy elements when less than required
if len(item["labels_selected"]) < self.num_labels_selected:
item["label_scores"] = np.concatenate(
[item["label_scores"], [-np.inf] * (self.num_labels_selected - len(item["labels_selected"]))]
)
item["labels_selected"] = np.concatenate(
[
item["labels_selected"],
[self.num_classes] * (self.num_labels_selected - len(item["labels_selected"])),
]
)

item["label_scores"] = np.asarray(item["label_scores"], dtype=np.float32)
if self.label_scores is not None:
item["label_scores"] = self.label_scores[idx]
return item
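With selection and padding moved out of the dataset, __getitem__ reduces to a lookup. A hedged sketch of what one item now contains, using made-up values and assuming the keyword-only constructor shown above:

import numpy as np
from scipy.sparse import csr_matrix
from libmultilabel.nn.datasets_AttentionXML import PLTDataset

x = [["some", "text"]]
y = csr_matrix(np.array([[0, 1, 0, 1]]))        # 1 instance, 4 labels
labels_selected = [np.array([1, 3])]            # labels expanded from predicted clusters
label_scores = [np.array([0.9, 0.4])]           # per-label scores (valid/test only)

item = PLTDataset(x, y, labels_selected=labels_selected, label_scores=label_scores)[0]
# item["text"]            -> ["some", "text"]
# item["label"]           -> array([0, 1, 0, 1])  full label row; no selection here
# item["labels_selected"] -> array([1, 3])
# item["label_scores"]    -> array([0.9, 0.4])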
21 changes: 1 addition & 20 deletions libmultilabel/nn/model_AttentionXML.py
@@ -34,29 +34,10 @@ def scatter_logits(
"""For each instance, we only have predictions on selected labels. This subroutine maps these predictions to
the whole label space. The scores of unsampled labels are set to 0."""
src = torch.sigmoid(logits.detach()) * label_scores
preds = torch.zeros(
labels_selected.size(0), len(self.classes) + 1, device=labels_selected.device, dtype=src.dtype
)
preds = torch.zeros(labels_selected.size(0), len(self.classes), device=labels_selected.device, dtype=src.dtype)
preds.scatter_(dim=1, index=labels_selected, src=src)
# remove dummy labels
preds = preds[:, :-1]
return preds
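A small worked example of the scatter step with three classes and two selected labels (toy tensors; values rounded):

import torch

logits = torch.tensor([[2.0, -1.0]])        # raw scores on the 2 selected labels
label_scores = torch.tensor([[0.8, 0.5]])   # propagated cluster scores for them
labels_selected = torch.tensor([[2, 0]])    # their indices in the full label space

src = torch.sigmoid(logits) * label_scores  # -> [[0.7046, 0.1345]]
preds = torch.zeros(1, 3)
preds.scatter_(dim=1, index=labels_selected, src=src)
print(preds)                                # [[0.1345, 0.0000, 0.7046]]
# Label 1 was never sampled, so its score stays 0, and with the dummy
# column gone preds already has exactly num_classes columns.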

def shared_step(self, batch):
"""Return loss and predicted logits of the network.
Args:
batch (dict): A batch of text and label.
Returns:
loss (torch.Tensor): Loss between target and predicted logits.
pred_logits (torch.Tensor): The predicted logits (batch_size, num_classes).
"""
y = torch.take_along_dim(batch["label"], batch["labels_selected"], dim=1)
logits = self(batch)
loss = self.loss_function(logits, y)
return loss, logits
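The removed shared_step aligned the dataset's full label row with the logits over selected labels via torch.take_along_dim; a toy illustration of that gather:

import torch

label = torch.tensor([[0., 1., 0., 1.]])    # full label row from the dataset
labels_selected = torch.tensor([[1, 3]])    # indices the network actually scores
y = torch.take_along_dim(label, labels_selected, dim=1)
print(y)                                    # [[1., 1.]] -- same shape as the logits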

def _shared_eval_step(self, batch, batch_idx):
logits = self(batch)
logits = self.scatter_logits(logits, batch["labels_selected"], batch["label_scores"])
109 changes: 56 additions & 53 deletions libmultilabel/nn/plt.py
@@ -126,61 +126,66 @@ def __init__(
# save path
self.log_path = config.log_path

def label2cluster(self, cluster_mapping, *ys) -> Generator[csr_matrix, ...]:
def label2cluster(self, cluster_mapping, *labels) -> Generator[csr_matrix, ...]:
"""Map labels to their corresponding clusters in CSR sparse format.
Suppose there are 6 labels and clusters are [(0, 1), (2, 3), (4, 5)] and ys of a given instance is [0, 1, 4].
The clusters of the instance are [0, 2].
Notice that this function deals with SPARSE matrices.
Assume there are 6 labels clustered as [(0, 1), (2, 3), (4, 5)]. Here (0, 1) is the cluster with index 0, and so on.
Given the ground-truth labels, [0, 1, 4], the resulting clusters are [0, 2].
Args:
cluster_mapping: mapping from clusters to labels.
*ys: sparse labels.
cluster_mapping (np.ndarray): mapping from clusters to labels generated by build_label_tree.
*labels (csr_matrix): labels in CSR sparse format.
Returns:
Generator[csr_matrix]: clusters generated from labels
Generator[csr_matrix]: resulting clusters converted from labels in CSR sparse format
"""
mapping = np.empty(self.num_classes, dtype=np.uint64)
for idx, clusters in enumerate(cluster_mapping):
mapping[clusters] = idx

def _label2cluster(y: csr_matrix) -> csr_matrix:
def _label2cluster(label: csr_matrix) -> csr_matrix:
row = []
col = []
data = []
for i in range(y.shape[0]):
for i in range(label.shape[0]):
# n includes all mapped ancestor clusters
n = np.unique(mapping[y.indices[y.indptr[i] : y.indptr[i + 1]]])
n = np.unique(mapping[label.indices[label.indptr[i] : label.indptr[i + 1]]])
row += [i] * len(n)
col += n.tolist()
data += [1] * len(n)
return csr_matrix((data, (row, col)), shape=(y.shape[0], len(cluster_mapping)))

return (_label2cluster(y) for y in ys)

# def cluster2label(self, cluster_mapping, *ys):
# """Map clusters to their corresponding labels. Notice this function only deals with dense matrix.
#
# Args:
# cluster_mapping: mapping from clusters to labels.
# *ys: sparse clusters.
#
# Returns:
# Generator[csr_matrix]: labels generated from clusters
# """
#
# def _cluster2label(y: csr_matrix) -> csr_matrix:
# self.labels_selected = [np.concatenate(cluster_mapping[labels]) for labels in y]
# return (_cluster2label(y) for y in ys)

# def generate_goals(self, cluster_scores, y):
# if cluster_scores is not None:
# # label_scores are corresponding scores for selected labels and
# # look like [[0.1, 0.1, 0.1, 0.4, 0.4, 0.5, 0.5,...], ...]. shape: (len(x), cluster_size * top_k)
# # notice how scores repeat for each cluster.
# self.label_scores = [
# np.repeat(scores, [len(i) for i in cluster_mapping[labels]])
# for labels, scores in zip(y, cluster_scores)
# ]
return csr_matrix((data, (row, col)), shape=(label.shape[0], len(cluster_mapping)))

return (_label2cluster(label) for label in labels)
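The docstring's example, written out as a self-contained sketch that inlines the inverse mapping built above (toy data):

import numpy as np
from scipy.sparse import csr_matrix

cluster_mapping = np.array([(0, 1), (2, 3), (4, 5)])  # cluster i -> its labels
label = csr_matrix(np.array([[1, 1, 0, 0, 1, 0]]))    # ground truth {0, 1, 4}

# Invert the mapping: label index -> cluster index.
mapping = np.empty(6, dtype=np.uint64)
for idx, labels in enumerate(cluster_mapping):
    mapping[labels] = idx

clusters = np.unique(mapping[label.indices])          # -> array([0, 2])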

@staticmethod
def cluster2label(cluster_mapping, clusters, cluster_scores=None):
"""Expand clusters to their corresponding labels and, if available, assign scores to each label.
Labels inside the same cluster have the same scores. This function is applied to predictions from model 0.
Notice that the behavior of this function differs from label2cluster.
Also notice that this function deals with DENSE matrices.
Args:
cluster_mapping (np.ndarray): mapping from clusters to labels generated by build_label_tree.
clusters (np.ndarray): predicted clusters from model 0.
cluster_scores (Optional: np.ndarray): predicted scores of each cluster from model 0.
Returns:
Generator[np.ndarray]: resulting labels expanded from clusters
"""

labels_selected = []

if cluster_scores is not None:
# label_scores are corresponding scores for selected labels and
# shape: (len(x), cluster_size * top_k)
label_scores = []
for score, cluster in zip(cluster_scores, clusters):
label_scores += [np.repeat(score, [len(labels) for labels in cluster_mapping[cluster]])]
labels_selected += [np.concatenate(cluster_mapping[cluster])]
return labels_selected, label_scores
else:
labels_selected = [np.concatenate(cluster_mapping[cluster]) for cluster in clusters]
return labels_selected
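Continuing the same toy setup, the static method expands predicted clusters back to their member labels and repeats each cluster score once per member (made-up scores, assuming PLTTrainer is importable from this module):

import numpy as np
from libmultilabel.nn.plt import PLTTrainer

cluster_mapping = np.array([(0, 1), (2, 3), (4, 5)])
clusters = np.array([[0, 2]])              # model 0 predicts clusters 0 and 2
cluster_scores = np.array([[0.9, 0.4]])

labels, scores = PLTTrainer.cluster2label(cluster_mapping, clusters, cluster_scores)
# labels[0] -> array([0, 1, 4, 5])          every label inside each predicted cluster
# scores[0] -> array([0.9, 0.9, 0.4, 0.4])  cluster score repeated per member label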

def fit(self, datasets):
"""fit model to the training dataset
@@ -291,7 +296,7 @@ def fit(self, datasets):
val_scores_pred = expit(np.vstack([i["top_k_pred_scores"] for i in val_pred]))
val_clusters_pred = np.vstack([i["top_k_pred"] for i in val_pred])

clusters_selected = np.empty((len(train_x), self.predict_top_k), dtype=np.int64)
train_clusters_selected = np.empty((len(train_x), self.predict_top_k), dtype=np.int64)
for i, ys in enumerate(tqdm(train_clusters_pred, leave=False, desc="Sampling clusters")):
# relevant clusters are positive
pos = set(train_y_clustered.indices[train_y_clustered.indptr[i] : train_y_clustered.indptr[i + 1]])
@@ -317,7 +322,10 @@ def fit(self, datasets):
break
if len(selected) < self.predict_top_k:
selected = (list(selected) + list(pos - selected))[: self.predict_top_k]
clusters_selected[i] = np.asarray(list(selected))
train_clusters_selected[i] = np.asarray(list(selected))

train_labels_selected = PLTTrainer.cluster2label(clusters, train_clusters_selected)
val_labels_pred, val_scores_pred = PLTTrainer.cluster2label(clusters, val_clusters_pred, val_scores_pred)

trainer = init_trainer(
self.checkpoint_dir,
Expand All @@ -339,20 +347,16 @@ def fit(self, datasets):
PLTDataset(
train_x,
train_y,
num_classes=self.num_classes,
mapping=clusters,
clusters_selected=clusters_selected,
labels_selected=train_labels_selected,
),
shuffle=self.shuffle,
)
val_dataloader = self.dataloader(
PLTDataset(
val_x,
val_y,
num_classes=self.num_classes,
mapping=clusters,
clusters_selected=val_clusters_pred,
cluster_scores=val_scores_pred,
labels_selected=val_labels_pred,
label_scores=val_scores_pred,
),
)

@@ -435,19 +439,18 @@ def test(self, dataset):

logger.info(f"Predicting level 0, Top: {self.predict_top_k}")
test_pred = trainer.predict(model_0, test_dataloader)
test_pred_scores = expit(np.vstack([i["top_k_pred_scores"] for i in test_pred]))
test_pred_cluters = np.vstack([i["top_k_pred"] for i in test_pred])
test_scores_pred = expit(np.vstack([i["top_k_pred_scores"] for i in test_pred]))
test_clusters_pred = np.vstack([i["top_k_pred"] for i in test_pred])

clusters = np.load(self.get_cluster_path(), allow_pickle=True)
test_labels_pred, test_scores_pred = PLTTrainer.cluster2label(clusters, test_clusters_pred, test_scores_pred)

test_dataloader = self.eval_dataloader(
PLTDataset(
test_x,
test_y,
num_classes=self.num_classes,
mapping=clusters,
clusters_selected=test_pred_cluters,
cluster_scores=test_pred_scores,
labels_selected=test_labels_pred,
label_scores=test_scores_pred,
),
)

