"""Declares specification of the Transformer model."""
from typing import Optional, Tuple, Union
import numpy as np
from ctranslate2.specs import attention_spec, common_spec, model_spec
class TransformerEncoderSpec(model_spec.LayerSpec):
def __init__(
self,
num_layers: int,
num_heads: int,
pre_norm: bool = True,
no_final_norm: bool = False,
activation: common_spec.Activation = common_spec.Activation.RELU,
num_source_embeddings: int = 1,
embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
layernorm_embedding: bool = False,
relative_position: bool = False,
relative_attention_bias: bool = False,
ffn_glu: bool = False,
rms_norm: bool = False,
multi_query_attention: bool = False,
):
"""Initializes a Transformer encoder specification.
Args:
num_layers: Number of layers.
num_heads: Number of attention heads.
pre_norm: Enable the pre-norm Transformer architecture.
no_final_norm: Disable the final layer norm in the pre-norm architecture.
activation: Activation to apply in the feed-forward network.
num_source_embeddings: Number of source embeddings.
embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
embeddings are merged.
layernorm_embedding: Apply layer normalization after the embedding layer.
relative_position: Use relative position representations in the self-attention
layers as described in https://arxiv.org/abs/1803.02155.
relative_attention_bias: Use relative attention bias in the self-attention
layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
ffn_glu: Use gated linear units in the FFN layers as described in
https://arxiv.org/abs/2002.05202.
rms_norm: Use the root mean square layer normalization.
multi_query_attention: Use multi-query attention.
"""
self.multi_query_attention = multi_query_attention
self.num_heads = np.dtype("int16").type(num_heads)
self.pre_norm = pre_norm
self.activation = np.dtype("int8").type(activation)
self.embeddings_merge = np.dtype("int8").type(embeddings_merge)
self.embeddings = [
common_spec.EmbeddingsSpec() for _ in range(num_source_embeddings)
]
self.scale_embeddings = True
if not relative_position and not relative_attention_bias:
self.position_encodings = PositionEncoderSpec()
if pre_norm and not no_final_norm:
self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
if layernorm_embedding:
self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
self.layer = [
TransformerEncoderLayerSpec(
relative_position=relative_position,
relative_attention_bias=relative_attention_bias,
ffn_glu=ffn_glu,
rms_norm=rms_norm,
num_heads_kv=1 if multi_query_attention else None,
)
for _ in range(num_layers)
]
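# Usage sketch (illustrative values, kept in comments so importing this module
# has no side effects): a 6-layer, 8-head pre-norm encoder specification.
# Weights are attached to the declared variables later, typically by a converter.
#
#     encoder_spec = TransformerEncoderSpec(num_layers=6, num_heads=8)
#     len(encoder_spec.layer)                      # 6
#     hasattr(encoder_spec, "position_encodings")  # True (no relative/rotary positions)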
class TransformerDecoderSpec(model_spec.LayerSpec):
def __init__(
self,
num_layers: int,
num_heads: int,
pre_norm: bool = True,
activation: common_spec.Activation = common_spec.Activation.RELU,
layernorm_embedding: bool = False,
with_encoder_attention: bool = True,
no_final_norm: bool = False,
project_in_out: bool = False,
relative_position: bool = False,
relative_attention_bias: bool = False,
alignment_layer: int = -1,
alignment_heads: int = 1,
ffn_glu: bool = False,
rms_norm: bool = False,
alibi: bool = False,
alibi_use_positive_positions: bool = False,
scale_alibi: bool = False,
rotary_dim: Optional[int] = None,
rotary_interleave: bool = True,
rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
rotary_scaling_factor: float = 1,
rotary_base: float = 10000,
original_max_position_embeddings: int = 0,
max_position_embeddings: int = 0,
parallel_residual: bool = False,
shared_layer_norm: bool = False,
pre_post_layer_norm: bool = False,
multi_query_attention: bool = False,
num_heads_kv: Optional[int] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
quant_type: Optional[common_spec.Quantization] = None,
quant_group_size: Optional[int] = None,
quant_bits: Optional[int] = None,
):
"""Initializes a Transformer decoder specification.
Args:
num_layers: Number of layers.
num_heads: Number of attention heads.
pre_norm: Enable the pre-norm Transformer architecture.
activation: Activation to apply in the feed-forward network.
layernorm_embedding: Apply layer normalization after the embedding layer.
with_encoder_attention: Enable the encoder attention sublayers.
no_final_norm: Disable the final layer norm in the pre-norm architecture.
project_in_out: Add linear transformations after the embedding layer and before
the final layer.
relative_position: Use relative position representations in the self-attention
layers as described in https://arxiv.org/abs/1803.02155.
relative_attention_bias: Use relative attention bias in the self-attention
layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
alignment_layer: Layer index selected for alignment.
alignment_heads: Number of attention heads selected for alignment.
ffn_glu: Use gated linear units in the FFN layers as described in
https://arxiv.org/abs/2002.05202.
rms_norm: Use the root mean square layer normalization.
alibi: Use attention with linear biases.
alibi_use_positive_positions: Use positive positions in the ALiBi definition.
scale_alibi: Apply the dot product scale factor to ALiBi.
rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
embeddings are applied to all dimensions.
rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
Otherwise the head dimensions are sliced in half.
rotary_scaling_type: Type of RoPE scaling.
rotary_scaling_factor: Factor used in the RoPE scaling.
rotary_base: The base period of the rotary embeddings.
original_max_position_embeddings: The original maximum number of position
embeddings, used by the Su-scaled rotary embeddings.
max_position_embeddings: The maximum number of position embeddings, used by
the Su-scaled rotary embeddings.
parallel_residual: Use parallel residual connections in each layer block, as used
by the GPT-J and GPT-NeoX models.
shared_layer_norm: When using parallel residual, share the input and post
attention layer norms.
pre_post_layer_norm: Add a post layer norm after each pre-norm layer.
multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
num_heads_kv: Number of attention heads for the key and value.
head_dim: Dimension of each attention head.
sliding_window: Maximum sequence length to retain in the KV cache.
quant_type: Low-bit weight quantization type (e.g. AWQ).
quant_group_size: Group size of the low-bit quantization.
quant_bits: Number of bits used by the quantization (e.g. 4).
"""
self._config = dict()
if parallel_residual:
if not pre_norm:
raise ValueError("The GPT-J block expects a pre-norm architecture")
if with_encoder_attention:
raise ValueError("The GPT-J block does not have cross attention")
if multi_query_attention:
if num_heads_kv is not None and num_heads_kv != 1:
raise ValueError(
"Enabling multi_query_attention implies num_heads_kv=1"
)
num_heads_kv = 1
if with_encoder_attention and num_heads_kv not in (None, 1, num_heads):
raise ValueError(
"num_heads_kv=%d is not supported in the cross-attention layers"
% num_heads_kv
)
self.num_heads = np.dtype("int16").type(num_heads)
self.pre_norm = pre_norm
self.activation = np.dtype("int8").type(activation)
self.alignment_layer = np.dtype("int16").type(alignment_layer)
self.alignment_heads = np.dtype("int16").type(alignment_heads)
self.embeddings = common_spec.EmbeddingsSpec()
self.scale_embeddings = True
self.scale_outputs = model_spec.OPTIONAL
self.alibi = alibi
self.alibi_use_positive_positions = alibi_use_positive_positions
self.scale_alibi = scale_alibi
if sliding_window is not None:
self.sliding_window = np.dtype("int32").type(sliding_window)
if (
not relative_position
and not relative_attention_bias
and not alibi
and rotary_dim is None
):
self.position_encodings = PositionEncoderSpec()
if pre_norm and not no_final_norm:
self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
if layernorm_embedding:
self.layernorm_embedding = common_spec.LayerNormSpec(rms_norm=rms_norm)
self.projection = common_spec.LinearSpec()
self.layer = [
TransformerDecoderLayerSpec(
with_encoder_attention=with_encoder_attention,
relative_position=relative_position,
relative_attention_bias=relative_attention_bias,
ffn_glu=ffn_glu,
rms_norm=rms_norm,
rotary_dim=rotary_dim,
rotary_interleave=rotary_interleave,
rotary_scaling_type=rotary_scaling_type,
rotary_scaling_factor=rotary_scaling_factor,
rotary_base=rotary_base,
original_max_position_embeddings=original_max_position_embeddings,
max_position_embeddings=max_position_embeddings,
parallel_residual=parallel_residual,
shared_layer_norm=shared_layer_norm,
pre_post_layer_norm=pre_post_layer_norm,
num_heads_kv=num_heads_kv,
head_dim=head_dim,
sliding_window=sliding_window,
)
for _ in range(num_layers)
]
self.start_from_zero_embedding = False
self._config["multi_query_attention"] = multi_query_attention or (
num_heads_kv != num_heads
)
if project_in_out:
self.project_in = common_spec.LinearSpec()
self.project_out = common_spec.LinearSpec()
if quant_type is not None:
self._config["quantization_type"] = quant_type
self._config["quantization_bits"] = quant_bits
self._config["quantization_group_size"] = quant_group_size
@property
def config(self):
return self._config
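# Illustrative notes on the constructor logic above (example values only):
#
#     # multi_query_attention=True is an alias for num_heads_kv=1:
#     dec = TransformerDecoderSpec(6, 8, with_encoder_attention=False,
#                                  multi_query_attention=True)
#     dec.config["multi_query_attention"]  # True
#
#     # Parallel-residual (GPT-J style) blocks require a pre-norm architecture
#     # and no cross attention, otherwise a ValueError is raised:
#     TransformerDecoderSpec(6, 8, parallel_residual=True)  # ValueError: cross attention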
class TransformerEncoderLayerSpec(model_spec.LayerSpec):
def __init__(
self,
relative_position=False,
relative_attention_bias=False,
ffn_glu=False,
rms_norm=False,
num_heads_kv=None,
sliding_window=None,
):
self.self_attention = attention_spec.MultiHeadAttentionSpec(
self_attention=True,
relative_position=relative_position,
relative_attention_bias=relative_attention_bias,
rms_norm=rms_norm,
num_heads_kv=num_heads_kv,
sliding_window=sliding_window,
)
self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)
class TransformerDecoderLayerSpec(model_spec.LayerSpec):
def __init__(
self,
with_encoder_attention=True,
relative_position=False,
relative_attention_bias=False,
ffn_glu=False,
rms_norm=False,
rotary_dim=None,
rotary_interleave=True,
rotary_scaling_type=None,
rotary_scaling_factor=1,
rotary_base=10000,
original_max_position_embeddings=0,
max_position_embeddings=0,
parallel_residual=False,
shared_layer_norm=False,
pre_post_layer_norm=False,
num_heads_kv=None,
head_dim=None,
sliding_window=None,
):
self.self_attention = attention_spec.MultiHeadAttentionSpec(
self_attention=True,
relative_position=relative_position,
relative_attention_bias=relative_attention_bias,
rms_norm=rms_norm,
rotary_dim=rotary_dim,
rotary_interleave=rotary_interleave,
rotary_scaling_type=rotary_scaling_type,
rotary_scaling_factor=rotary_scaling_factor,
rotary_base=rotary_base,
original_max_position_embeddings=original_max_position_embeddings,
max_position_embeddings=max_position_embeddings,
num_heads_kv=num_heads_kv,
head_dim=head_dim,
sliding_window=sliding_window,
)
if with_encoder_attention:
self.attention = attention_spec.MultiHeadAttentionSpec(
rms_norm=rms_norm,
num_heads_kv=num_heads_kv,
sliding_window=sliding_window,
)
self.ffn = FeedForwardSpec(glu=ffn_glu, rms_norm=rms_norm)
if parallel_residual:
if shared_layer_norm:
self.shared_layer_norm = common_spec.LayerNormSpec()
else:
self.input_layer_norm = common_spec.LayerNormSpec()
self.post_attention_layer_norm = common_spec.LayerNormSpec()
delattr(self.self_attention, "layer_norm")
delattr(self.ffn, "layer_norm")
if pre_post_layer_norm:
self.input_layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
self.post_attention_layer_norm = common_spec.LayerNormSpec(
rms_norm=rms_norm
)
self.pre_feedforward_layer_norm = common_spec.LayerNormSpec(
rms_norm=rms_norm
)
self.post_feedforward_layer_norm = common_spec.LayerNormSpec(
rms_norm=rms_norm
)
delattr(self.self_attention, "layer_norm")
delattr(self.ffn, "layer_norm")
class FeedForwardSpec(model_spec.LayerSpec):
def __init__(self, glu=False, rms_norm=False):
self.layer_norm = common_spec.LayerNormSpec(rms_norm=rms_norm)
self.linear_0 = common_spec.LinearSpec()
self.linear_1 = common_spec.LinearSpec()
if glu:
self.linear_0_noact = common_spec.LinearSpec()
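# Sketch of the gated variant (illustrative): with glu=True an extra projection
# without activation is declared so a runtime can compute something like
# activation(linear_0(x)) * linear_0_noact(x) before linear_1, following the
# GLU-variants paper referenced in the specs above.
#
#     ffn = FeedForwardSpec(glu=True)
#     hasattr(ffn, "linear_0_noact")  # True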
class PositionEncoderSpec(model_spec.LayerSpec):
def __init__(self):
self.encodings = model_spec.OPTIONAL
class TransformerConfig(model_spec.SequenceToSequenceModelConfig):
"""Configuration for Transformer models."""
def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
"""Initializes the configuration for Transformer models.
Args:
layer_norm_epsilon: The layer norm epsilon value.
**kwargs: Additional configuration.
"""
super().__init__(layer_norm_epsilon=layer_norm_epsilon, **kwargs)
class TransformerSpec(model_spec.SequenceToSequenceModelSpec):
"""Describes a Transformer model.
The specification is invariant to hidden dimensions but requires the number of
layers and attention heads to be set explicitly.
"""
def __init__(
self, encoder: TransformerEncoderSpec, decoder: TransformerDecoderSpec
):
"""Initializes a Transformer model specification.
Args:
encoder: The encoder specification.
decoder: The decoder specification.
"""
if not isinstance(encoder, TransformerEncoderSpec):
raise TypeError("encoder argument must be a TransformerEncoderSpec")
if not isinstance(decoder, TransformerDecoderSpec):
raise TypeError("decoder argument must be a TransformerDecoderSpec")
super().__init__()
self.encoder = encoder
self.decoder = decoder
self._config.add_attribute(
"multi_query_attention", self.encoder.multi_query_attention
)
@classmethod
def from_config(
cls,
num_layers: Union[int, Tuple[int, int]],
num_heads: int,
with_relative_position: bool = False,
pre_norm: bool = True,
no_final_norm: bool = False,
activation: common_spec.Activation = common_spec.Activation.RELU,
alignment_layer: int = -1,
alignment_heads: int = 1,
num_source_embeddings: int = 1,
embeddings_merge: common_spec.EmbeddingsMerge = common_spec.EmbeddingsMerge.CONCAT,
layernorm_embedding: bool = False,
relative_attention_bias: bool = False,
ffn_glu: bool = False,
rms_norm: bool = False,
multi_query_attention: bool = False,
):
"""Creates a Transformer model specification.
Args:
num_layers: Number of encoder and decoder layers, or a 2-tuple if the
encoder and decoder have a different number of layers.
num_heads: Number of attention heads.
with_relative_position: Use relative position representations in the self-attention
layers as described in https://arxiv.org/abs/1803.02155.
pre_norm: Enable the pre-norm Transformer architecture.
no_final_norm: Disable the final layer norm in the pre-norm architecture.
activation: Activation to apply in the feed-forward network.
alignment_layer: Layer index selected for alignment.
alignment_heads: Number of attention heads selected for alignment.
num_source_embeddings: Number of source embeddings.
embeddings_merge: When :obj:`num_source_embeddings` > 1, specify how the
embeddings are merged.
layernorm_embedding: Apply layer normalization after the embedding layer.
relative_attention_bias: Use relative attention bias in the self-attention
layers as described in the T5 paper https://arxiv.org/abs/1910.10683.
ffn_glu: Use gated linear units in the FFN layers as described in
https://arxiv.org/abs/2002.05202.
rms_norm: Use the root mean square layer normalization.
multi_query_attention: Use multi-query attention.
"""
if isinstance(num_layers, (list, tuple)):
num_encoder_layers, num_decoder_layers = num_layers
else:
num_encoder_layers, num_decoder_layers = num_layers, num_layers
encoder = TransformerEncoderSpec(
num_encoder_layers,
num_heads,
pre_norm=pre_norm,
no_final_norm=no_final_norm,
activation=activation,
num_source_embeddings=num_source_embeddings,
embeddings_merge=embeddings_merge,
layernorm_embedding=layernorm_embedding,
relative_position=with_relative_position,
relative_attention_bias=relative_attention_bias,
ffn_glu=ffn_glu,
rms_norm=rms_norm,
multi_query_attention=multi_query_attention,
)
decoder = TransformerDecoderSpec(
num_decoder_layers,
num_heads,
pre_norm=pre_norm,
no_final_norm=no_final_norm,
activation=activation,
layernorm_embedding=layernorm_embedding,
relative_position=with_relative_position,
relative_attention_bias=relative_attention_bias,
alignment_layer=alignment_layer,
alignment_heads=alignment_heads,
ffn_glu=ffn_glu,
rms_norm=rms_norm,
multi_query_attention=multi_query_attention,
)
return cls(encoder, decoder)
@property
def name(self):
return "TransformerSpec"
@property
def revision(self):
return 7
def get_default_config(self):
return TransformerConfig()
def get_source_vocabulary_size(self):
return [spec.weight.shape[0] for spec in self.encoder.embeddings]
def get_target_vocabulary_size(self):
return self.decoder.embeddings.weight.shape[0]
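# Usage sketch (example sizes only): building a sequence-to-sequence
# specification from a configuration.
#
#     spec = TransformerSpec.from_config(num_layers=6, num_heads=8)
#     spec.name      # "TransformerSpec"
#     spec.revision  # 7
#
#     # Asymmetric depths are passed as an (encoder, decoder) tuple:
#     spec = TransformerSpec.from_config(num_layers=(12, 6), num_heads=16)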
class TransformerDecoderModelConfig(model_spec.LanguageModelConfig):
"""Configuration for Transformer decoder models."""
def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
"""Initializes the configuration for Transformer decoder models.
Args:
layer_norm_epsilon: The layer norm epsilon value.
**kwargs: Additional configuration.
"""
super().__init__(layer_norm_epsilon=layer_norm_epsilon, **kwargs)
class TransformerDecoderModelSpec(model_spec.LanguageModelSpec):
"""Describes a Transformer decoder model (e.g. GPT-2)."""
def __init__(self, decoder: TransformerDecoderSpec):
"""Initializes a Transformer decoder model specification.
Args:
decoder: The decoder specification.
"""
if not isinstance(decoder, TransformerDecoderSpec):
raise TypeError("decoder argument must be a TransformerDecoderSpec")
super().__init__()
self.decoder = decoder
for key, value in self.decoder.config.items():
self._config.add_attribute(key, value)
@classmethod
def from_config(
cls,
num_layers: int,
num_heads: int,
pre_norm: bool = True,
activation: common_spec.Activation = common_spec.Activation.RELU,
layernorm_embedding: bool = False,
no_final_norm: bool = False,
project_in_out: bool = False,
with_relative_position: bool = False,
ffn_glu: bool = False,
rms_norm: bool = False,
alibi: bool = False,
alibi_use_positive_positions: bool = False,
scale_alibi: bool = False,
rotary_dim: Optional[int] = None,
rotary_interleave: bool = True,
rotary_scaling_type: Optional[attention_spec.RotaryScalingType] = None,
rotary_scaling_factor: float = 1,
rotary_base: float = 10000,
original_max_position_embeddings: int = 0,
max_position_embeddings: int = 0,
parallel_residual: bool = False,
shared_layer_norm: bool = False,
pre_post_layer_norm: bool = False,
multi_query_attention: bool = False,
num_heads_kv: Optional[int] = None,
head_dim: Optional[int] = None,
sliding_window: Optional[int] = None,
quant_type: Optional[common_spec.Quantization] = None,
quant_group_size: Optional[int] = None,
quant_bits: Optional[int] = None,
):
"""Creates a Transformer decoder model specification.
Args:
num_layers: Number of decoder layers.
num_heads: Number of attention heads.
pre_norm: Enable the pre-norm Transformer architecture.
activation: Activation to apply in the feed-forward network.
layernorm_embedding: Apply layer normalization after the embedding layer.
no_final_norm: Do not apply layer normalization after the last decoder block.
project_in_out: Add a linear layer after the embedding layer and another one
before the final output projection.
with_relative_position: Enable the relative position representation modules.
ffn_glu: Use gated linear units in the FFN layers as described in
https://arxiv.org/abs/2002.05202.
rms_norm: Use the root mean square layer normalization.
alibi: Use attention with linear biases.
alibi_use_positive_positions: Use positive positions in the ALiBi definition.
scale_alibi: Apply the dot product scale factor to ALiBi.
rotary_dim: Apply rotary embeddings to these first N dimensions. If 0, rotary
embeddings are applied to all dimensions.
rotary_interleave: Interleave the head dimensions when rotary embeddings are applied.
Otherwise the head dimensions are sliced in half.
rotary_scaling_type: Type of RoPE scaling.
rotary_scaling_factor: Factor used in the RoPE scaling.
rotary_base: The base period of the rotary embeddings.
original_max_position_embeddings: The original maximum number of position
embeddings, used by the Su-scaled rotary embeddings.
max_position_embeddings: The maximum number of position embeddings, used by
the Su-scaled rotary embeddings.
parallel_residual: Use parallel residual connections in each layer block, as used
by the GPT-J and GPT-NeoX models.
shared_layer_norm: When using parallel residual, share the input and post
attention layer norms.
pre_post_layer_norm: Add a post layer norm after each pre-norm layer.
multi_query_attention: Use multi-query attention (alias for num_heads_kv=1).
num_heads_kv: Number of attention heads for the key and value.
head_dim: Dimension of each attention head.
sliding_window: Maximum sequence length to retain in the KV cache.
quant_type: Low-bit weight quantization type (e.g. AWQ).
quant_group_size: Group size of the low-bit quantization.
quant_bits: Number of bits used by the quantization (e.g. 4).
"""
decoder = TransformerDecoderSpec(
num_layers,
num_heads,
pre_norm=pre_norm,
activation=activation,
layernorm_embedding=layernorm_embedding,
with_encoder_attention=False,
no_final_norm=no_final_norm,
project_in_out=project_in_out,
relative_position=with_relative_position,
ffn_glu=ffn_glu,
rms_norm=rms_norm,
alibi=alibi,
alibi_use_positive_positions=alibi_use_positive_positions,
scale_alibi=scale_alibi,
rotary_dim=rotary_dim,
rotary_interleave=rotary_interleave,
rotary_scaling_type=rotary_scaling_type,
rotary_scaling_factor=rotary_scaling_factor,
rotary_base=rotary_base,
original_max_position_embeddings=original_max_position_embeddings,
max_position_embeddings=max_position_embeddings,
parallel_residual=parallel_residual,
shared_layer_norm=shared_layer_norm,
pre_post_layer_norm=pre_post_layer_norm,
multi_query_attention=multi_query_attention,
num_heads_kv=num_heads_kv,
head_dim=head_dim,
sliding_window=sliding_window,
quant_type=quant_type,
quant_group_size=quant_group_size,
quant_bits=quant_bits,
)
return cls(decoder)
@property
def name(self):
return "TransformerDecoderSpec"
@property
def revision(self):
return 8
def get_default_config(self):
return TransformerDecoderModelConfig()
def get_vocabulary_size(self):
return self.decoder.embeddings.weight.shape[0]
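# Usage sketch: a decoder-only language model specification with rotary
# embeddings, RMSNorm, and grouped-query attention. The sizes and the SWISH
# activation below are example values for a Llama-style model, not defaults
# of this module.
#
#     spec = TransformerDecoderModelSpec.from_config(
#         num_layers=32,
#         num_heads=32,
#         activation=common_spec.Activation.SWISH,
#         ffn_glu=True,
#         rms_norm=True,
#         rotary_dim=0,          # apply rotary embeddings to all head dimensions
#         rotary_interleave=False,
#         num_heads_kv=8,        # 8 key/value heads shared by the 32 query heads
#     )
#     spec.get_vocabulary_size()  # valid once the embedding weights have been set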
class TransformerEncoderModelConfig(model_spec.LanguageModelConfig):
"""Configuration for Transformer encoder models."""
def __init__(self, layer_norm_epsilon: Optional[float] = None, **kwargs):
"""Initializes the configuration for Transformer encoder models.
Args:
layer_norm_epsilon: The layer norm epsilon value.
**kwargs: Additional configuration.
"""
super().__init__(layer_norm_epsilon=layer_norm_epsilon, **kwargs)
class TransformerEncoderModelSpec(model_spec.LanguageModelSpec):
"""Describes a Transformer encoder model (e.g. BERT)."""
def __init__(
self,
encoder: TransformerEncoderSpec,
pooling_layer: bool = False,
pooling_activation: common_spec.Activation = common_spec.Activation.Tanh,
):
"""Initializes a Transformer encoder model specification.
Args:
encoder: The encoder specification.
pooling_layer: Add the pooling layer.
pooling_activation: The activation to apply after the pooling layer.
"""
if not isinstance(encoder, TransformerEncoderSpec):
raise TypeError("encoder argument must be a TransformerEncoderSpec")
super().__init__()
self.encoder = encoder
self._config.add_attribute(
"multi_query_attention", self.encoder.multi_query_attention
)
if pooling_layer:
self.pooler_dense = common_spec.LinearSpec()
self.pooler_activation = np.dtype("int8").type(pooling_activation)
@property
def name(self):
return "TransformerEncoderSpec"
@property
def revision(self):
return 1
def get_default_config(self):
return TransformerEncoderModelConfig()
def get_vocabulary_size(self):
return self.encoder.embeddings[0].weight.shape[0]
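# Usage sketch: a BERT-style encoder-only specification with a pooling head.
# The sizes, post-norm layout, and GELU activation are example values for a
# typical base-sized model, not defaults of this module.
#
#     encoder = TransformerEncoderSpec(
#         num_layers=12,
#         num_heads=12,
#         pre_norm=False,
#         activation=common_spec.Activation.GELU,
#         layernorm_embedding=True,
#     )
#     spec = TransformerEncoderModelSpec(encoder, pooling_layer=True)
#     hasattr(spec, "pooler_dense")  # True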