from __future__ import annotations
import logging
import shutil
import tempfile
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Literal
import huggingface_hub
from sentence_transformers.util import disable_datasets_caching, is_datasets_available
logger = logging.getLogger(__name__)
if TYPE_CHECKING:
from sentence_transformers.SentenceTransformer import SentenceTransformer
try:
from optimum.intel import OVQuantizationConfig
except ImportError:
pass
try:
from optimum.onnxruntime.configuration import OptimizationConfig, QuantizationConfig
except ImportError:
pass
def export_optimized_onnx_model(
model: SentenceTransformer,
optimization_config: OptimizationConfig | Literal["O1", "O2", "O3", "O4"],
model_name_or_path: str,
push_to_hub: bool = False,
create_pr: bool = False,
file_suffix: str | None = None,
) -> None:
"""
Export an optimized ONNX model from a SentenceTransformer model.
The O1-O4 optimization levels are defined by Optimum and are documented here:
https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/optimization
The optimization levels are:
- O1: basic general optimizations.
- O2: basic and extended general optimizations, transformers-specific fusions.
- O3: same as O2 with GELU approximation.
    - O4: same as O3 with mixed precision (fp16, GPU-only).
See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks.
Args:
model (SentenceTransformer): The SentenceTransformer model to be optimized. Must be loaded with `backend="onnx"`.
optimization_config (OptimizationConfig | Literal["O1", "O2", "O3", "O4"]): The optimization configuration or level.
model_name_or_path (str): The path or Hugging Face Hub repository name where the optimized model will be saved.
push_to_hub (bool, optional): Whether to push the optimized model to the Hugging Face Hub. Defaults to False.
create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False.
file_suffix (str | None, optional): The suffix to add to the optimized model file name. Defaults to None.
Raises:
ImportError: If the required packages `optimum` and `onnxruntime` are not installed.
ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="onnx"`.
ValueError: If the provided optimization_config is not valid.
Returns:
None
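
    Example:
        A minimal usage sketch; the model name here is only illustrative::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.backend import export_optimized_onnx_model

            # The model must be loaded with the ONNX backend
            model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")

            # Apply "O3" optimizations and save locally; the optimized model is written to
            # `path/to/output/onnx/model_O3.onnx`
            export_optimized_onnx_model(model, "O3", "path/to/output")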
"""
from sentence_transformers import SentenceTransformer
from sentence_transformers.models.Transformer import Transformer
try:
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTOptimizer
from optimum.onnxruntime.configuration import AutoOptimizationConfig
except ImportError:
raise ImportError(
"Please install Optimum and ONNX Runtime to use this function. "
"You can install them with pip: `pip install optimum[onnxruntime]` "
"or `pip install optimum[onnxruntime-gpu]`"
)
if (
not isinstance(model, SentenceTransformer)
or not len(model)
or not isinstance(model[0], Transformer)
or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
):
raise ValueError(
'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
)
ort_model: ORTModelForFeatureExtraction = model[0].auto_model
optimizer = ORTOptimizer.from_pretrained(ort_model)
    if isinstance(optimization_config, str):
        # Validate against the documented levels rather than a private Optimum attribute
        if optimization_config not in ["O1", "O2", "O3", "O4"]:
            raise ValueError(
                "optimization_config must be an OptimizationConfig instance or one of 'O1', 'O2', 'O3', 'O4'."
            )

        # Use the level name (e.g. "O3") as the default file suffix, then resolve it to an actual config
        file_suffix = file_suffix or optimization_config
        optimization_config = getattr(AutoOptimizationConfig, optimization_config)()
if file_suffix is None:
file_suffix = "optimized"
save_or_push_to_hub_model(
export_function=lambda save_dir: optimizer.optimize(optimization_config, save_dir, file_suffix=file_suffix),
export_function_name="export_optimized_onnx_model",
config=optimization_config,
model_name_or_path=model_name_or_path,
push_to_hub=push_to_hub,
create_pr=create_pr,
file_suffix=file_suffix,
backend="onnx",
)
def export_dynamic_quantized_onnx_model(
model: SentenceTransformer,
quantization_config: QuantizationConfig | Literal["arm64", "avx2", "avx512", "avx512_vnni"],
model_name_or_path: str,
push_to_hub: bool = False,
create_pr: bool = False,
file_suffix: str | None = None,
) -> None:
"""
Export a quantized ONNX model from a SentenceTransformer model.
This function applies dynamic quantization, i.e. without a calibration dataset.
    Each of the default quantization configurations quantizes the model to int8, allowing for
    faster inference on CPUs, but likely slower inference on GPUs.
See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks.
Args:
model (SentenceTransformer): The SentenceTransformer model to be quantized. Must be loaded with `backend="onnx"`.
        quantization_config (QuantizationConfig | Literal["arm64", "avx2", "avx512", "avx512_vnni"]): The
            quantization configuration, or the name of a default configuration for the given instruction set.
model_name_or_path (str): The path or Hugging Face Hub repository name where the quantized model will be saved.
push_to_hub (bool, optional): Whether to push the quantized model to the Hugging Face Hub. Defaults to False.
create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False.
file_suffix (str | None, optional): The suffix to add to the quantized model file name. Defaults to None.
Raises:
ImportError: If the required packages `optimum` and `onnxruntime` are not installed.
ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="onnx"`.
ValueError: If the provided quantization_config is not valid.
Returns:
None
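
    Example:
        A minimal usage sketch; the model name here is only illustrative::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.backend import export_dynamic_quantized_onnx_model

            # The model must be loaded with the ONNX backend
            model = SentenceTransformer("all-MiniLM-L6-v2", backend="onnx")

            # Quantize the weights to int8 with the avx512_vnni preset and save locally;
            # the quantized model is written to `path/to/output/onnx/model_qint8_avx512_vnni.onnx`
            export_dynamic_quantized_onnx_model(model, "avx512_vnni", "path/to/output")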
"""
from sentence_transformers import SentenceTransformer
from sentence_transformers.models.Transformer import Transformer
try:
from optimum.onnxruntime import ORTModelForFeatureExtraction, ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig
except ImportError:
raise ImportError(
"Please install Optimum and ONNX Runtime to use this function. "
"You can install them with pip: `pip install optimum[onnxruntime]` "
"or `pip install optimum[onnxruntime-gpu]`"
)
if (
not isinstance(model, SentenceTransformer)
or not len(model)
or not isinstance(model[0], Transformer)
or not isinstance(model[0].auto_model, ORTModelForFeatureExtraction)
):
raise ValueError(
'The model must be a Transformer-based SentenceTransformer model loaded with `backend="onnx"`.'
)
ort_model: ORTModelForFeatureExtraction = model[0].auto_model
quantizer = ORTQuantizer.from_pretrained(ort_model)
    if isinstance(quantization_config, str):
        if quantization_config not in ["arm64", "avx2", "avx512", "avx512_vnni"]:
            raise ValueError(
                "quantization_config must be a QuantizationConfig instance or one of 'arm64', 'avx2', 'avx512', or 'avx512_vnni'."
            )

        # Remember the preset name for the file suffix before resolving it into an actual config
        quantization_config_name = quantization_config
        quantization_config = getattr(AutoQuantizationConfig, quantization_config)(is_static=False)
        file_suffix = file_suffix or f"{quantization_config.weights_dtype.name.lower()}_{quantization_config_name}"

    if file_suffix is None:
        file_suffix = f"{quantization_config.weights_dtype.name.lower()}_quantized"
save_or_push_to_hub_model(
export_function=lambda save_dir: quantizer.quantize(quantization_config, save_dir, file_suffix=file_suffix),
export_function_name="export_dynamic_quantized_onnx_model",
config=quantization_config,
model_name_or_path=model_name_or_path,
push_to_hub=push_to_hub,
create_pr=create_pr,
file_suffix=file_suffix,
backend="onnx",
)
def export_static_quantized_openvino_model(
model: SentenceTransformer,
quantization_config: OVQuantizationConfig | dict | None,
model_name_or_path: str,
dataset_name: str | None = None,
dataset_config_name: str | None = None,
dataset_split: str | None = None,
column_name: str | None = None,
push_to_hub: bool = False,
create_pr: bool = False,
file_suffix: str = "qint8_quantized",
) -> None:
"""
Export a quantized OpenVINO model from a SentenceTransformer model.
This function applies Post-Training Static Quantization (PTQ) using a calibration dataset, which calibrates
quantization constants without requiring model retraining. Each default quantization configuration converts
the model to int8 precision, enabling faster inference while maintaining accuracy.
See https://sbert.net/docs/sentence_transformer/usage/efficiency.html for more information & benchmarks.
Args:
model (SentenceTransformer): The SentenceTransformer model to be quantized. Must be loaded with `backend="openvino"`.
quantization_config (OVQuantizationConfig | dict | None): The quantization configuration. If None, default values are used.
model_name_or_path (str): The path or Hugging Face Hub repository name where the quantized model will be saved.
        dataset_name (str, optional): The name of the dataset to load for calibration.
            If not specified, the `sst2` subset of the `glue` dataset will be used by default.
        dataset_config_name (str, optional): The specific configuration of the dataset to load.
        dataset_split (str, optional): The split of the dataset to load (e.g., 'train', 'test'). Defaults to None.
        column_name (str, optional): The column name in the dataset to use for calibration. Defaults to None.
            Note that `dataset_name`, `dataset_config_name`, `dataset_split`, and `column_name` must either all
            be specified or all be left unspecified.
push_to_hub (bool, optional): Whether to push the quantized model to the Hugging Face Hub. Defaults to False.
create_pr (bool, optional): Whether to create a pull request when pushing to the Hugging Face Hub. Defaults to False.
file_suffix (str, optional): The suffix to add to the quantized model file name. Defaults to `qint8_quantized`.
Raises:
ImportError: If the required packages `optimum` and `openvino` are not installed.
ValueError: If the provided model is not a valid SentenceTransformer model loaded with `backend="openvino"`.
ValueError: If the provided quantization_config is not valid.
Returns:
None
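
    Example:
        A minimal usage sketch; the model name here is only illustrative::

            from sentence_transformers import SentenceTransformer
            from sentence_transformers.backend import export_static_quantized_openvino_model

            # The model must be loaded with the OpenVINO backend
            model = SentenceTransformer("all-MiniLM-L6-v2", backend="openvino")

            # Calibrate on the default `glue`/`sst2` dataset and save locally; the quantized model is
            # written to `path/to/output/openvino/openvino_model_qint8_quantized.xml` (plus its `.bin` file)
            export_static_quantized_openvino_model(model, None, "path/to/output")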
"""
from sentence_transformers import SentenceTransformer
from sentence_transformers.models.Transformer import Transformer
try:
from optimum.intel import OVConfig, OVModelForFeatureExtraction, OVQuantizationConfig, OVQuantizer
except ImportError:
raise ImportError(
"Please install datasets, optimum-intel and openvino to use this function. "
"You can install them with pip: `pip install datasets optimum[openvino]`"
)
if not is_datasets_available():
raise ImportError(
"Please install datasets to use this function. You can install it with pip: `pip install datasets`"
)
if (
not isinstance(model, SentenceTransformer)
or not len(model)
or not isinstance(model[0], Transformer)
or not isinstance(model[0].auto_model, OVModelForFeatureExtraction)
):
raise ValueError(
'The model must be a Transformer-based SentenceTransformer model loaded with `backend="openvino"`.'
)
if quantization_config is None:
quantization_config = OVQuantizationConfig()
ov_model: OVModelForFeatureExtraction = model[0].auto_model
ov_config = OVConfig(quantization_config=quantization_config)
quantizer = OVQuantizer.from_pretrained(ov_model)
if any(param is not None for param in [dataset_name, dataset_config_name, dataset_split, column_name]) and not all(
param is not None for param in [dataset_name, dataset_config_name, dataset_split, column_name]
):
raise ValueError(
"Either specify all of `dataset_name`, `dataset_config_name`, `dataset_split`, and `column_name`, or leave them all unspecified."
)
def preprocess_function(examples):
return model.tokenizer(examples, padding="max_length", max_length=384, truncation=True)
dataset_name = dataset_name if dataset_name is not None else "glue"
dataset_config_name = dataset_config_name if dataset_config_name is not None else "sst2"
dataset_split = dataset_split if dataset_split is not None else "train"
column_name = column_name if column_name is not None else "sentence"
with disable_datasets_caching():
calibration_dataset = quantizer.get_calibration_dataset(
dataset_name=dataset_name,
dataset_config_name=dataset_config_name,
preprocess_function=lambda examples: preprocess_function(examples[column_name]),
            # `quantization_config` may also be a dict here; fall back to the OVQuantizationConfig
            # default of 300 samples in that case instead of raising an AttributeError
            num_samples=(
                quantization_config.num_samples
                if isinstance(quantization_config, OVQuantizationConfig)
                else quantization_config.get("num_samples", 300)
            ),
dataset_split=dataset_split,
)
save_or_push_to_hub_model(
export_function=lambda save_dir: quantizer.quantize(
calibration_dataset, save_directory=save_dir, ov_config=ov_config
),
export_function_name="export_static_quantized_openvino_model",
config=quantization_config,
model_name_or_path=model_name_or_path,
push_to_hub=push_to_hub,
create_pr=create_pr,
file_suffix=file_suffix,
backend="openvino",
)
def save_or_push_to_hub_model(
export_function: Callable,
export_function_name: str,
config,
model_name_or_path: str,
push_to_hub: bool = False,
create_pr: bool = False,
file_suffix: str | None = None,
backend: str = "onnx",
) -> None:
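    """
    Run `export_function` in a temporary directory, then either upload the exported file(s) to the
    `backend` folder of the `model_name_or_path` repository on the Hugging Face Hub (optionally as
    a pull request), or copy them into the local `<model_name_or_path>/<backend>` directory.
    """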
    if backend == "onnx":
        file_name = f"model_{file_suffix}.onnx"
    elif backend == "openvino":
        file_name = f"openvino_model_{file_suffix}.xml"
    else:
        raise ValueError(f"Unsupported backend: {backend!r}. Expected 'onnx' or 'openvino'.")
with tempfile.TemporaryDirectory() as save_dir:
export_function(save_dir)
# OpenVINO models are saved in a nested directory
if backend == "openvino":
save_dir = Path(save_dir) / backend
# and we need to attach the file_suffix for both the .xml and .bin files
shutil.move(save_dir / "openvino_model.xml", save_dir / file_name)
shutil.move(save_dir / "openvino_model.bin", (save_dir / file_name).with_suffix(".bin"))
save_dir = save_dir.as_posix()
# Because we upload folders and save_dir now has unnecessary files (tokenizer.json, config.json, etc.),
# we move the main file to a nested directory
if backend == "onnx":
dst_dir = Path(save_dir) / backend
dst_dir.mkdir(parents=True, exist_ok=True)
source = Path(save_dir) / file_name
destination = dst_dir / file_name
shutil.move(source, destination)
save_dir = dst_dir.as_posix()
if push_to_hub:
commit_description = ""
if create_pr:
opt_config_string = repr(config).replace("(", "(\n\t").replace(", ", ",\n\t").replace(")", "\n)")
commit_description = f"""\
Hello!
*This pull request has been automatically generated from the [`{export_function_name}`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.backend.{export_function_name}) function from the Sentence Transformers library.*
## Config
```python
{opt_config_string}
```
## Tip:
Consider testing this pull request before merging by loading the model from this PR with the `revision` argument:
```python
from sentence_transformers import SentenceTransformer
# TODO: Fill in the PR number
pr_number = 2
model = SentenceTransformer(
"{model_name_or_path}",
revision=f"refs/pr/{{pr_number}}",
backend="{backend}",
model_kwargs={{"file_name": "{file_name}"}},
)
# Verify that everything works as expected
embeddings = model.encode(["The weather is lovely today.", "It's so sunny outside!", "He drove to the stadium."])
print(embeddings.shape)
similarities = model.similarity(embeddings, embeddings)
print(similarities)
```
"""
huggingface_hub.upload_folder(
folder_path=save_dir,
path_in_repo=backend,
repo_id=model_name_or_path,
repo_type="model",
commit_message=f"Add exported {backend} model {file_name!r}",
commit_description=commit_description,
create_pr=create_pr,
)
else:
dst_dir = Path(model_name_or_path) / backend
# Create destination if it does not exist
dst_dir.mkdir(parents=True, exist_ok=True)
source = Path(save_dir) / file_name
destination = dst_dir / file_name
shutil.copy(source, destination)
# OpenVINO has a second file to save: the .bin file
if backend == "openvino":
bin_source = (Path(save_dir) / file_name).with_suffix(".bin")
bin_destination = (Path(dst_dir) / file_name).with_suffix(".bin")
shutil.copy(bin_source, bin_destination)