from __future__ import annotations
import math
import os
from pathlib import Path
import numpy as np
import torch
from safetensors.torch import load_file as load_safetensors_file
from safetensors.torch import save_file as save_safetensors_file
from tokenizers import Tokenizer
from torch import nn
from transformers import PreTrainedTokenizerFast
from sentence_transformers.util import get_device_name
class StaticEmbedding(nn.Module):
    """
    Sentence embedding module that pools per-token embeddings with an ``nn.EmbeddingBag``.

    Texts are tokenized with a fast (Rust-backed) tokenizer, flattened into a single 1D tensor of
    token ids plus per-text start offsets, and reduced to one vector per text by the embedding bag.
    """

    def __init__(
        self,
        tokenizer: Tokenizer | PreTrainedTokenizerFast,
        embedding_weights: np.ndarray | torch.Tensor | None = None,
        embedding_dim: int | None = None,
        **kwargs,
    ) -> None:
        """
        Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
        takes the mean of trained per-token embeddings to compute text embeddings.

        Args:
            tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
                from ``transformers`` or ``tokenizers``. Padding is disabled on this tokenizer in place.
            embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
                Defaults to None.
            embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
                is not provided. Defaults to None.
            **kwargs: Only ``base_model`` is read; it is stored for the model card.

        Example::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.models import StaticEmbedding
            from tokenizers import Tokenizer

            # Pre-distilled embeddings:
            static_embedding = StaticEmbedding.from_model2vec("minishlab/M2V_base_output")
            # or distill your own embeddings:
            static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
            # or start with randomized embeddings:
            tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
            static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

            model = SentenceTransformer(modules=[static_embedding])

            embeddings = model.encode([
                "What are Pandas?",
                "The giant panda (Ailuropoda melanoleuca; Chinese: 大熊猫; pinyin: dàxióngmāo), also known "
                "as the panda bear or simply the panda, is a bear native to south central China.",
            ])
            similarity = model.similarity(embeddings[0], embeddings[1])
            # tensor([[0.9177]]) (If you use the distilled bge-base)

        Raises:
            ValueError: If the tokenizer is not a fast tokenizer.
            ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
        """
        super().__init__()

        # Normalize to the underlying `tokenizers.Tokenizer`, which is what `tokenize` calls directly.
        if isinstance(tokenizer, PreTrainedTokenizerFast):
            tokenizer = tokenizer._tokenizer
        elif not isinstance(tokenizer, Tokenizer):
            raise ValueError(
                "The tokenizer must be fast (i.e. Rust-backed) to use this class. "
                "Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer."
            )

        if embedding_weights is not None:
            if isinstance(embedding_weights, np.ndarray):
                embedding_weights = torch.from_numpy(embedding_weights)
            # freeze=False keeps the pre-trained weights trainable.
            self.embedding = nn.EmbeddingBag.from_pretrained(embedding_weights, freeze=False)
        elif embedding_dim is not None:
            self.embedding = nn.EmbeddingBag(tokenizer.get_vocab_size(), embedding_dim)
        else:
            raise ValueError("Either `embedding_weights` or `embedding_dim` must be provided.")

        self.num_embeddings = self.embedding.num_embeddings
        self.embedding_dim = self.embedding.embedding_dim

        self.tokenizer: Tokenizer = tokenizer
        # Texts are concatenated into one flat id tensor in `tokenize`, so padding must be off.
        self.tokenizer.no_padding()

        # For the model card
        self.base_model = kwargs.get("base_model", None)

    def tokenize(self, texts: list[str], **kwargs) -> dict[str, torch.Tensor]:
        """
        Tokenizes a batch of texts into the flat layout consumed by the embedding bag.

        Args:
            texts (list[str]): The texts to tokenize. Special tokens are not added.

        Returns:
            dict[str, torch.Tensor]: ``input_ids``, a 1D int64 tensor with the token ids of all texts
            concatenated, and ``offsets``, a 1D int64 tensor with the start index of each text in ``input_ids``.
        """
        encodings = self.tokenizer.encode_batch(texts, add_special_tokens=False)
        encodings_ids = [encoding.ids for encoding in encodings]
        lengths = [len(token_ids) for token_ids in encodings_ids]

        # Each text starts where the previous one ended, so the last length is not needed.
        # The dtype is pinned to int64 so that `offsets` always matches `input_ids`, independent of
        # NumPy's platform-dependent default integer type.
        offsets = torch.from_numpy(np.cumsum([0] + lengths[:-1], dtype=np.int64))
        input_ids = torch.tensor(
            [token_id for token_ids in encodings_ids for token_id in token_ids], dtype=torch.long
        )
        return {"input_ids": input_ids, "offsets": offsets}

    def forward(self, features: dict[str, torch.Tensor], **kwargs) -> dict[str, torch.Tensor]:
        """
        Computes one embedding per text and stores it under ``sentence_embedding``.

        Args:
            features (dict[str, torch.Tensor]): Must contain ``input_ids`` and ``offsets`` as produced
                by :meth:`tokenize`. The dictionary is updated in place.

        Returns:
            dict[str, torch.Tensor]: The same ``features`` dictionary with ``sentence_embedding`` added.
        """
        features["sentence_embedding"] = self.embedding(features["input_ids"], features["offsets"])
        return features

    def get_config_dict(self) -> dict[str, float]:
        """Returns the module configuration, which is empty for this module."""
        return {}

    @property
    def max_seq_length(self) -> float:
        """Maximum sequence length; always ``math.inf`` for this module."""
        return math.inf

    def get_sentence_embedding_dimension(self) -> int:
        """Returns the dimensionality of the produced sentence embeddings."""
        return self.embedding_dim

    def save(self, save_dir: str, safe_serialization: bool = True, **kwargs) -> None:
        """
        Saves the embedding weights and the tokenizer to ``save_dir``.

        Args:
            save_dir (str): Target directory.
            safe_serialization (bool): If True, write ``model.safetensors``; otherwise write
                ``pytorch_model.bin`` with ``torch.save``. Defaults to True.
        """
        if safe_serialization:
            save_safetensors_file(self.state_dict(), os.path.join(save_dir, "model.safetensors"))
        else:
            torch.save(self.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))
        self.tokenizer.save(str(Path(save_dir) / "tokenizer.json"))

    @staticmethod
    def load(load_dir: str, **kwargs) -> StaticEmbedding:
        """
        Loads a StaticEmbedding from a directory written by :meth:`save`.

        Args:
            load_dir (str): Directory containing ``tokenizer.json`` and either ``model.safetensors``
                or ``pytorch_model.bin``. The safetensors file is preferred when both exist.

        Returns:
            StaticEmbedding: A new instance initialized with the stored tokenizer and weights.
        """
        tokenizer = Tokenizer.from_file(str(Path(load_dir) / "tokenizer.json"))

        safetensors_path = os.path.join(load_dir, "model.safetensors")
        if os.path.exists(safetensors_path):
            weights = load_safetensors_file(safetensors_path)
        else:
            weights = torch.load(
                os.path.join(load_dir, "pytorch_model.bin"), map_location=torch.device("cpu"), weights_only=True
            )
        weights = weights["embedding.weight"]
        return StaticEmbedding(tokenizer, embedding_weights=weights)

    @staticmethod
    def _extract_embedding_weights(static_model) -> torch.Tensor:
        """Returns the embedding matrix of a model2vec model, given as a NumPy array or an embedding module."""
        if isinstance(static_model.embedding, np.ndarray):
            return torch.from_numpy(static_model.embedding)
        return static_model.embedding.weight

    @classmethod
    def from_distillation(
        cls,
        model_name: str,
        vocabulary: list[str] | None = None,
        device: str | None = None,
        pca_dims: int | None = 256,
        apply_zipf: bool = True,
        use_subword: bool = True,
    ) -> StaticEmbedding:
        """
        Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

        Args:
            model_name (str): The name of the model to distill.
            vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
            device (str | None, optional): The device to run the distillation on (e.g., 'cpu', 'cuda').
                If not specified, the strongest device is automatically detected. Defaults to None.
            pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
            apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
            use_subword (bool): Whether to use subword tokenization. Defaults to True.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
                tokenizer and embedding weights.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec.distill import distill
        except ImportError as err:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec[distill]`"
            ) from err

        # Only auto-detect when the caller did not choose a device; an explicit choice must be honored.
        if device is None:
            device = get_device_name()

        static_model = distill(
            model_name,
            vocabulary=vocabulary,
            device=device,
            pca_dims=pca_dims,
            apply_zipf=apply_zipf,
            use_subword=use_subword,
        )
        embedding_weights = cls._extract_embedding_weights(static_model)
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_name)

    @classmethod
    def from_model2vec(cls, model_id_or_path: str) -> StaticEmbedding:
        """
        Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
        and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

        Args:
            model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

        Returns:
            StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
                of the model2vec model.

        Raises:
            ImportError: If the `model2vec` package is not installed.
        """
        try:
            from model2vec import StaticModel
        except ImportError as err:
            raise ImportError(
                "To use this method, please install the `model2vec` package: `pip install model2vec`"
            ) from err

        static_model = StaticModel.from_pretrained(model_id_or_path)
        embedding_weights = cls._extract_embedding_weights(static_model)
        tokenizer: Tokenizer = static_model.tokenizer
        return cls(tokenizer, embedding_weights=embedding_weights, base_model=model_id_or_path)