from __future__ import annotations
from collections.abc import Iterable
from torch import Tensor, nn
from sentence_transformers.SentenceTransformer import SentenceTransformer
from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction
class BatchAllTripletLoss(nn.Module):
def __init__(
self,
model: SentenceTransformer,
distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance,
margin: float = 5,
) -> None:
"""
        BatchAllTripletLoss takes a batch with (sentence, label) pairs and computes the loss for all possible, valid
        triplets, i.e., the anchor and positive must have the same label, while the anchor and negative must have
        different labels. The labels must be integers, with the same label indicating sentences from the same class.
        Your train dataset must contain at least 2 examples per label class.
Args:
model: SentenceTransformer model
            distance_metric: Function that returns a distance between
                two embeddings. The class BatchHardTripletLossDistanceFunction
                contains pre-defined metrics that can be used.
            margin: Negative samples should be at least ``margin`` further
                apart from the anchor than the positive samples.
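
        For each valid triplet ``(a, p, n)``, the per-triplet loss is
        ``max(d(a, p) - d(a, n) + margin, 0)``, where ``d`` is the chosen distance metric;
        the final loss is the mean over all triplets with a positive (non-zero) value.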
References:
* Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py
* Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737
* Blog post: https://omoindrot.github.io/triplet-loss
Requirements:
1. Each sentence must be labeled with a class.
            2. Your dataset must contain at least 2 examples per label class.
Inputs:
+------------------+--------+
| Texts | Labels |
+==================+========+
| single sentences | class |
+------------------+--------+
Recommendations:
- Use ``BatchSamplers.GROUP_BY_LABEL`` (:class:`docs <sentence_transformers.training_args.BatchSamplers>`) to
ensure that each batch contains 2+ examples per label class.
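
              For example, a brief sketch of selecting this sampler via the training arguments::

                  from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers

                  args = SentenceTransformerTrainingArguments(
                      output_dir="output",
                      batch_sampler=BatchSamplers.GROUP_BY_LABEL,
                  )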
Relations:
* :class:`BatchHardTripletLoss` uses only the hardest positive and negative samples, rather than all possible, valid triplets.
* :class:`BatchHardSoftMarginTripletLoss` uses only the hardest positive and negative samples, rather than all possible, valid triplets.
Also, it does not require setting a margin.
        * :class:`BatchSemiHardTripletLoss` uses only semi-hard, valid triplets rather than all possible, valid triplets.
Example:
::
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
from datasets import Dataset
model = SentenceTransformer("microsoft/mpnet-base")
# E.g. 0: sports, 1: economy, 2: politics
train_dataset = Dataset.from_dict({
"sentence": [
"He played a great game.",
"The stock is up 20%",
"They won 2-1.",
"The last goal was amazing.",
"They all voted against the bill.",
],
"label": [0, 1, 0, 0, 2],
})
loss = losses.BatchAllTripletLoss(model)
trainer = SentenceTransformerTrainer(
model=model,
train_dataset=train_dataset,
loss=loss,
)
trainer.train()
"""
super().__init__()
self.sentence_embedder = model
self.triplet_margin = margin
self.distance_metric = distance_metric
def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
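        # `sentence_features` holds a single column of tokenized sentences; embed it
        # and mine all valid triplets from the batch using the integer class labels.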
rep = self.sentence_embedder(sentence_features[0])["sentence_embedding"]
return self.batch_all_triplet_loss(labels, rep)
def batch_all_triplet_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor:
"""Build the triplet loss over a batch of embeddings.
We generate all the valid triplets and average the loss over the positive ones.
Args:
labels: labels of the batch, of size (batch_size,)
embeddings: tensor of shape (batch_size, embed_dim)
margin: margin for triplet loss
squared: Boolean. If true, output is the pairwise squared euclidean distance matrix.
If false, output is the pairwise euclidean distance matrix.
Returns:
Label_Sentence_Triplet: scalar tensor containing the triplet loss
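
        Example (a minimal sketch, assuming ``loss_fct`` is an instance of this class
        and using random embeddings; not part of the public API)::

            import torch

            labels = torch.tensor([0, 0, 1, 1])  # 2+ examples per class
            embeddings = torch.randn(4, 768)     # batch of pooled sentence embeddings
            loss = loss_fct.batch_all_triplet_loss(labels, embeddings)  # scalar tensor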
"""
# Get the pairwise distance matrix
pairwise_dist = self.distance_metric(embeddings)
anchor_positive_dist = pairwise_dist.unsqueeze(2)
anchor_negative_dist = pairwise_dist.unsqueeze(1)
# Compute a 3D tensor of size (batch_size, batch_size, batch_size)
# triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k
# Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1)
# and the 2nd (batch_size, 1, batch_size)
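        # For example, with batch_size=3, anchor_positive_dist[i, j, 0] = d(i, j) and
        # anchor_negative_dist[i, 0, k] = d(i, k), so the subtraction below broadcasts to
        # triplet_loss[i, j, k] = d(i, j) - d(i, k) + margin for every (i, j, k).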
triplet_loss = anchor_positive_dist - anchor_negative_dist + self.triplet_margin
# Put to zero the invalid triplets
# (where label(a) != label(p) or label(n) == label(a) or a == p)
mask = BatchHardTripletLoss.get_triplet_mask(labels)
triplet_loss = mask.float() * triplet_loss
# Remove negative losses (i.e. the easy triplets)
triplet_loss[triplet_loss < 0] = 0
# Count number of positive triplets (where triplet_loss > 0)
valid_triplets = triplet_loss[triplet_loss > 1e-16]
num_positive_triplets = valid_triplets.size(0)
# num_valid_triplets = mask.sum()
# fraction_positive_triplets = num_positive_triplets / (num_valid_triplets.float() + 1e-16)
# Get final mean triplet loss over the positive valid triplets
triplet_loss = triplet_loss.sum() / (num_positive_triplets + 1e-16)
return triplet_loss
@property
def citation(self) -> str:
return """
@misc{hermans2017defense,
title={In Defense of the Triplet Loss for Person Re-Identification},
author={Alexander Hermans and Lucas Beyer and Bastian Leibe},
year={2017},
eprint={1703.07737},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
"""