# ------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------------------- import logging from typing import Optional, Union from fusion_attention import FusionAttention from fusion_base import Fusion from onnx import FunctionProto, NodeProto, TensorProto, helper, numpy_helper from onnx_model import OnnxModel logger = logging.getLogger(__name__) class FusionRotaryAttention(FusionAttention): """ Fuse Attention subgraph with rotary positional embeddings into one MultiHeadAttention node. """ def __init__( self, model: OnnxModel, hidden_size: int, num_heads: int, ): super().__init__( model, hidden_size, num_heads, use_multi_head_attention=True, search_op_types=[ "SimplifiedLayerNormalization", "SkipSimplifiedLayerNormalization", "LayerNormalization", "SkipLayerNormalization", "Add", ], ) def create_mha_node( self, input: str, output: str, q_rotary: NodeProto, k_rotary: NodeProto, v_matmul: NodeProto, attn_mask: str = "", add_qk: str = "", past_k: str = "", past_v: str = "", present_k: str = "", present_v: str = "", scale: Optional[float] = None, ) -> Union[NodeProto, None]: assert self.num_heads > 0 if self.hidden_size > 0 and (self.hidden_size % self.num_heads) != 0: logger.debug( f"fuse_rotary_attention: input hidden size {self.hidden_size} is not a multiple of num of heads {self.num_heads}" ) return None mha_node_name = self.model.create_node_name("MultiHeadAttention") mha_inputs = [ q_rotary.output[0], k_rotary.output[0], v_matmul.output[0], "", # bias attn_mask, # key_padding_mask add_qk, # attention_bias past_k, past_v, ] mha_outputs = [output] if present_k and present_v: mha_outputs.extend([present_k, present_v]) mha_node = helper.make_node( "MultiHeadAttention", inputs=mha_inputs, outputs=mha_outputs, name=mha_node_name, ) mha_node.domain = "com.microsoft" mha_node.attribute.extend([helper.make_attribute("num_heads", self.num_heads)]) if scale is not None: mha_node.attribute.extend([helper.make_attribute("scale", scale)]) if self.mask_filter_value is not None: mha_node.attribute.extend([helper.make_attribute("mask_filter_value", float(self.mask_filter_value))]) self.increase_counter("MultiHeadAttention") return mha_node def check_runtime_shape_paths_for_function( self, reshape_qkv_2, # Reshape after Transpose reshape_qkv_1, # Reshape before Transpose reshape_q_2, # Reshape after RotaryEmbedding reshape_k_2, # Reshape after RotaryEmbedding reshape_v_2, # Reshape after Transpose reshape_v_1, # Reshape before Transpose add_qk, # Add before Softmax root_input, # Root input to attention subgraph ): # Check #1: check paths for qkv nodes concat_qkv_2_path = self.model.match_parent_path(reshape_qkv_2, ["Concat"], [1]) concat_qkv_1_path = self.model.match_parent_path(reshape_qkv_1, ["Concat"], [1]) if concat_qkv_2_path is None or concat_qkv_1_path is None: return False concat_qkv_2, concat_qkv_1 = concat_qkv_2_path[0], concat_qkv_1_path[0] reshape_qkv_2_path_1 = self.model.match_parent_path(concat_qkv_2, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_qkv_2_path_2 = self.model.match_parent_path(concat_qkv_2, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) reshape_qkv_1_path_1 = self.model.match_parent_path(concat_qkv_1, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_qkv_1_path_2 = self.model.match_parent_path(concat_qkv_1, ["Unsqueeze", "Gather", "Shape"], [2, 0, 0]) if ( reshape_qkv_2_path_1 is None or reshape_qkv_2_path_2 is None or reshape_qkv_1_path_1 is None or reshape_qkv_1_path_2 is None ): return False _, gather_1, shape_1 = reshape_qkv_2_path_1 _, gather_2, shape_2 = reshape_qkv_2_path_2 # Check root_input --> Shape --> Gather connection if shape_1.input[0] != root_input or shape_2.input[0] != root_input: return False # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_qkv_1_path_1 and reshape_qkv_1_path_2 if reshape_qkv_1_path_1[1].name != gather_1.name or reshape_qkv_1_path_2[1].name != gather_2.name: return False # Check #2: check paths for v nodes concat_v_2_path = self.model.match_parent_path(reshape_v_2, ["Concat"], [1]) concat_v_1_path = self.model.match_parent_path(reshape_v_1, ["Concat"], [1]) if concat_v_2_path is None or concat_v_1_path is None: return False concat_v_2, concat_v_1 = concat_v_2_path[0], concat_v_1_path[0] reshape_v_2_path_1 = self.model.match_parent_path( concat_v_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] ) reshape_v_2_path_2 = self.model.match_parent_path( concat_v_2, ["Unsqueeze", "Add", "Gather", "Shape"], [1, 0, 0, 0] ) reshape_v_1_path_1 = self.model.match_parent_path(concat_v_1, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_v_1_path_2 = self.model.match_parent_path(concat_v_1, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if ( reshape_v_2_path_1 is None or reshape_v_2_path_2 is None or reshape_v_1_path_1 is None or reshape_v_1_path_2 is None ): return False # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_v_2_path_1 # Check Gather --> Add --> Unsqueeze --> Concat --> Reshape connection for reshape_v_2_path_2 # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_v_1_path_1 and reshape_v_1_path_2 if ( reshape_v_2_path_1[2].name != gather_1.name or reshape_v_2_path_2[2].name != gather_2.name or reshape_v_1_path_1[1].name != gather_1.name or reshape_v_1_path_2[1].name != gather_2.name ): return False # Check #3: check paths for k nodes concat_k_2_path = self.model.match_parent_path(reshape_k_2, ["Concat"], [1]) if concat_k_2_path is None: return False concat_k_2 = concat_k_2_path[0] reshape_k_2_path_1 = self.model.match_parent_path( concat_k_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] ) reshape_k_2_path_2 = self.model.match_parent_path( concat_k_2, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 0, 0] ) if reshape_k_2_path_1 is None or reshape_k_2_path_2 is None: return False # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_k_2_path_1 # Check Gather --> Add --> Unsqueeze --> Concat --> Reshape connection for reshape_k_2_path_2 if reshape_k_2_path_1[2].name != gather_1.name or reshape_k_2_path_2[2].name != gather_2.name: return False # Check #4: check paths for q nodes concat_q_2_path = self.model.match_parent_path(reshape_q_2, ["Concat"], [1]) if concat_q_2_path is None: return False concat_q_2 = concat_q_2_path[0] reshape_q_2_path_1 = self.model.match_parent_path( concat_q_2, ["Unsqueeze", "Mul", "Gather", "Shape"], [0, 0, 0, 0] ) reshape_q_2_path_2 = self.model.match_parent_path(concat_q_2, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if reshape_q_2_path_1 is None or reshape_q_2_path_2 is None: return False # Check Gather --> Mul --> Unsqueeze --> Concat --> Reshape connection for reshape_q_2_path_1 # Check Gather --> Unsqueeze --> Concat --> Reshape connection for reshape_q_2_path_2 if reshape_q_2_path_1[2].name != gather_1.name or reshape_q_2_path_2[1].name != gather_2.name: return False # Check #5: check Mul nodes are the same for q, k, v mul_q = reshape_q_2_path_1[1] mul_k = reshape_k_2_path_1[1] mul_v = reshape_v_2_path_1[1] gather_1_out = gather_1.output[0] if mul_q.input[0] != gather_1_out or mul_k.input[0] != gather_1_out or mul_v.input[0] != gather_1_out: return False # Check #6: check paths for attention mask nodes attn_mask_path_1 = self.model.match_parent_path(add_qk, ["Concat", "Slice", "Slice"], [1, 0, 0]) attn_mask_path_2 = self.model.match_parent_path(add_qk, ["Cast", "Concat", "Slice", "Slice"], [1, 0, 0, 0]) if attn_mask_path_1 is not None: _, slice_qk_2, slice_qk_1 = attn_mask_path_1 elif attn_mask_path_2 is not None: _, _, slice_qk_2, slice_qk_1 = attn_mask_path_2 else: return False # Check first input to Slice #1 is 3D attention mask of shape (B,S,T) if slice_qk_1.input[0] not in {"attn_mask", "attention_mask"}: return False slice_qk_2_path = self.model.match_parent_path( slice_qk_2, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 1, 0] ) slice_qk_1_path_1 = self.model.match_parent_path( slice_qk_1, ["Unsqueeze", "Add", "Gather", "Shape"], [2, 0, 1, 0] ) slice_qk_1_path_2 = self.model.match_parent_path(slice_qk_1, ["Unsqueeze"], [1]) if slice_qk_2_path is None or slice_qk_1_path_1 is None or slice_qk_1_path_2 is None: return False # Check Gather --> Add --> Unsqueeze #3 --> Slice #2 connection for slice_qk_2_path # Check Gather --> Add --> Unsqueeze #2 --> Slice #1 connection for slice_qk_1_path_1 if slice_qk_2_path[1].name != slice_qk_1_path_1[1].name or slice_qk_2_path[2].name != slice_qk_1_path_1[2].name: return False # Check Unsqueeze #1 --> Slice #1 connection for slice_qk_1_path_2 # Check if first input to Add and Unsqueeze #1 is position ids if slice_qk_1_path_1[1].input[0] != slice_qk_1_path_2[0].input[0]: return False return True def check_runtime_shape_paths_for_nodes( self, reshape_qkv, # Final reshape before o_proj MatMul reshape_q, # Reshape before q_proj MatMul reshape_k, # Reshape before k_proj MatMul reshape_v, # Reshape before v_proj MatMul root_input, # Root input to attention subgraph ): # Check #1: check paths for qkv nodes concat_qkv_path = self.model.match_parent_path(reshape_qkv, ["Concat"], [1]) if concat_qkv_path is None: return False concat_qkv = concat_qkv_path[0] reshape_qkv_path_1 = self.model.match_parent_path(concat_qkv, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_qkv_path_2 = self.model.match_parent_path(concat_qkv, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if reshape_qkv_path_1 is None or reshape_qkv_path_2 is None: return False _, gather_1, shape_1 = reshape_qkv_path_1 _, gather_2, shape_2 = reshape_qkv_path_2 # Check root_input --> Shape --> Gather connection if shape_1.input[0] != root_input or shape_2.input[0] != root_input: return False # Check #2: check paths for v nodes concat_v_path = self.model.match_parent_path(reshape_v, ["Concat"], [1]) if concat_v_path is None: return False concat_v = concat_v_path[0] reshape_v_path_1 = self.model.match_parent_path(concat_v, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_v_path_2 = self.model.match_parent_path(concat_v, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if reshape_v_path_1 is None or reshape_v_path_2 is None: return False # Check Gather --> Unsqueeze --> Concat --> Reshape connection if reshape_v_path_1[1].name != gather_1.name or reshape_v_path_2[1].name != gather_2.name: return False # Check #3: check paths for k nodes concat_k_path = self.model.match_parent_path(reshape_k, ["Concat"], [1]) if concat_k_path is None: return False concat_k = concat_k_path[0] reshape_k_path_1 = self.model.match_parent_path(concat_k, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_k_path_2 = self.model.match_parent_path(concat_k, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if reshape_k_path_1 is None or reshape_k_path_2 is None: return False # Check Gather --> Unsqueeze --> Concat --> Reshape connection if reshape_k_path_1[1].name != gather_1.name or reshape_k_path_2[1].name != gather_2.name: return False # Check #4: check paths for q nodes concat_q_path = self.model.match_parent_path(reshape_q, ["Concat"], [1]) if concat_q_path is None: return False concat_q = concat_q_path[0] reshape_q_path_1 = self.model.match_parent_path(concat_q, ["Unsqueeze", "Gather", "Shape"], [0, 0, 0]) reshape_q_path_2 = self.model.match_parent_path(concat_q, ["Unsqueeze", "Gather", "Shape"], [1, 0, 0]) if reshape_q_path_1 is None or reshape_q_path_2 is None: return False # Check Gather --> Unsqueeze --> Concat --> Reshape connection if reshape_q_path_1[1].name != gather_1.name or reshape_q_path_2[1].name != gather_2.name: return False return True def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if normalize_node.op_type not in {"SkipSimplifiedLayerNormalization", "SkipLayerNormalization", "Add"}: return # qkv_nodes_1 is for LLaMA-2 Microsoft # qkv_nodes_2 is for LLaMA-2 Hugging Face # qkv_nodes_3 is for LLaMA-2 distribute Hugging Face model qkv_nodes = None qkv_nodes_1 = self.model.match_parent_path( normalize_node, ["MatMul", "Reshape", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 0, 0], ) qkv_nodes_2 = self.model.match_parent_path( normalize_node, ["MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0], ) qkv_nodes_3 = self.model.match_parent_path( normalize_node, ["AllReduce", "MatMul", "Reshape", "Transpose", "MatMul"], [1, 0, 0, 0, 0], ) if qkv_nodes_1 is not None: _, reshape_qkv_2, _, reshape_qkv_1, matmul_qkv = qkv_nodes_1 qkv_nodes = qkv_nodes_1 elif qkv_nodes_2 is not None: _, reshape_qkv, _, matmul_qkv = qkv_nodes_2 qkv_nodes = qkv_nodes_2 elif qkv_nodes_3 is not None: _, _, reshape_qkv, _, matmul_qkv = qkv_nodes_3 qkv_nodes = qkv_nodes_3 else: logger.debug("fuse_rotary_attention: failed to match qkv nodes") return # v_nodes_1 is for LLaMA-2 Microsoft # v_nodes_3 is for LLaMA-2 Hugging Face # v_nodes_4 is for LLaMA-2 70B model # v_nodes_5 is for Phi-2 DirectML past_v, present_v, past_seq_len = "", "", "" v_nodes = None add_v = None v_nodes_1 = self.model.match_parent_path( matmul_qkv, ["Reshape", "Transpose", "Concat", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 1, 0, 0], ) v_nodes_2 = self.model.match_parent_path( matmul_qkv, ["Concat", "Transpose", "Reshape", "MatMul"], [1, 1, 0, 0], ) v_nodes_3 = self.model.match_parent_path( matmul_qkv, ["Transpose", "Reshape", "MatMul"], [1, 0, 0], ) _, v_nodes_4, _ = self.model.match_parent_paths_all( matmul_qkv, [ ( ["Reshape", "Expand", "Unsqueeze", "Concat", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 0, 1, 0, 0], ), ( [ "Reshape", "Expand", "Where", "Equal", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], ), ( [ "Reshape", "Expand", "Where", "Equal", "Mul", "ConstantOfShape", "Shape", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0], ), ( [ "Reshape", "Expand", "Where", "ConstantOfShape", "Shape", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0], ), ( [ "Reshape", "Expand", "Where", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 2, 0, 4, 0, 0, 0, 1, 0, 0], ), ( ["Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul"], [1, 1, 0, 0, 0, 0, 1, 0, 0], ), ( [ "Reshape", "Concat", "Unsqueeze", "Mul", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul", ], [1, 1, 1, 0, 0, 0, 0, 1, 0, 0], ), ( ["Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul"], [1, 1, 2, 0, 0, 0, 1, 0, 0], ), ( ["Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "Transpose", "Reshape", "MatMul"], [1, 1, 3, 0, 0, 0, 1, 0, 0], ), ], output_name_to_node=None, ) v_nodes_5 = self.model.match_parent_path( matmul_qkv, ["Concat", "Transpose", "Reshape", "Add", "MatMul"], [1, 1, 0, 0, 1], ) if v_nodes_1 is not None: reshape_v_2, _, concat_v, _, reshape_v_1, matmul_v = v_nodes_1 v_nodes = v_nodes_1 concat_v_path = self.model.match_parent_path( concat_v, ["Slice", "Unsqueeze"], [0, 2], ) if concat_v_path is None: logger.debug("fuse_rotary_attention: failed to match past/present concat in v path") return past_v = concat_v_path[0].input[0] past_seq_len = concat_v_path[-1].input[0] present_v = concat_v.output[0] elif v_nodes_2 is not None: concat_v, transpose_v, reshape_v, matmul_v = v_nodes_2 v_nodes = v_nodes_2 past_v = concat_v.input[0] present_v = concat_v.output[0] elif v_nodes_3 is not None: transpose_v, reshape_v, matmul_v = v_nodes_3 v_nodes = v_nodes_3 present_v = transpose_v.output[0] elif v_nodes_4 is not None and len(v_nodes_4) == 9: concat_v, transpose_v, reshape_v, matmul_v = v_nodes_4[0][-4:] v_nodes = v_nodes_4 past_v = concat_v.input[0] present_v = concat_v.output[0] elif v_nodes_5 is not None: concat_v, transpose_v, reshape_v, add_v, matmul_v = v_nodes_5 matmul_v = add_v v_nodes = v_nodes_5 past_v = concat_v.input[0] present_v = concat_v.output[0] else: logger.debug("fuse_rotary_attention: failed to match v path") return qk_nodes = self.model.match_parent_path( matmul_qkv, ["Softmax", "Add", "Div", "MatMul"], [0, 0, 0, 0], ) add_qk, matmul_qk = None, None if qk_nodes is not None: _, add_qk, _, matmul_qk = qk_nodes else: logger.debug("fuse_rotary_attention: failed to match qk nodes") return # attn_mask_nodes_1, attn_mask_nodes_2 are for LLaMA-2 Microsoft's 3D attention mask # attn_mask_nodes_3, attn_mask_nodes_4 are for LLaMA-2 Hugging Face's 2D attention mask # attn_mask_nodes_5, attn_mask_nodes_6 are for LLaMA-2 Microsoft's model for the DML EP # attn_mask_nodes_7 is for LLaMA-2 Hugging Face's changes to the attention mask attn_mask, add_qk_str = "", "" attn_mask_nodes_1 = self.model.match_parent_path( add_qk, ["Concat", "Slice", "Slice"], [1, 0, 0], ) attn_mask_nodes_2 = self.model.match_parent_path( add_qk, ["Cast", "Concat", "Slice", "Slice"], [1, 0, 0, 0], ) attn_mask_nodes_3 = self.model.match_parent_path( add_qk, ["Add", "Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [1, 0, 2, 1, 0, 0, 0], ) attn_mask_nodes_4 = self.model.match_parent_path( add_qk, ["Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [1, 2, 1, 0, 0, 0], ) attn_mask_nodes_5 = self.model.match_parent_path( add_qk, ["Expand", "Add", "Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [1, 0, 0, 2, 1, 0, 0, 0], ) attn_mask_nodes_6 = self.model.match_parent_path( add_qk, ["Expand", "Where", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [1, 0, 2, 1, 0, 0, 0], ) attn_mask_nodes_7 = self.model.match_parent_path( add_qk, ["Where", "Cast", "Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [1, 0, 0, 0, 0, 1, 0, 0, 0], ) if attn_mask_nodes_1 is not None: _, slice_mask_1, slice_mask_2 = attn_mask_nodes_1 attn_mask = slice_mask_1.output[0] elif attn_mask_nodes_2 is not None: _, _, slice_mask_1, slice_mask_2 = attn_mask_nodes_2 attn_mask = slice_mask_1.output[0] elif attn_mask_nodes_3 is not None: # Reshape from (B,1,S,T) to (B,N,S,T) add_qk_str = self.reshape_add_qk(attn_mask_nodes_3[0].output[0]) elif attn_mask_nodes_4 is not None: # Reshape from (B,1,S,T) to (B,N,S,T) add_qk_str = self.reshape_add_qk(attn_mask_nodes_4[0].output[0]) elif attn_mask_nodes_5 is not None: # The mask has already been reshaped to (B,N,S,T) add_qk_str = attn_mask_nodes_5[0].output[0] elif attn_mask_nodes_6 is not None: # The mask has already been reshaped to (B,N,S,T) add_qk_str = attn_mask_nodes_6[0].output[0] elif attn_mask_nodes_7 is not None: # Reshape from (B,1,S,T) to (B,N,S,T) add_qk_str = self.reshape_add_qk(attn_mask_nodes_7[0].output[0]) else: logger.debug("fuse_rotary_attention: failed to match attention mask nodes") return # k_nodes_1 is for LLaMA-2 Microsoft # k_nodes_2 is for LLaMA-2 Hugging Face # k_nodes_4 is for LLaMA-2 70B Hugging Face past_k, present_k = "", "" k_nodes = None slice_k = None concat_k_half = None k_nodes_1 = self.model.match_parent_path( matmul_qk, ["Reshape", "Transpose", "Concat", "Transpose", "RotaryEmbedding", "MatMul"], [1, 0, 0, 1, 0, 0], ) k_nodes_2 = self.model.match_parent_path( matmul_qk, ["Transpose", "RotaryEmbedding", "Transpose", "Reshape", "MatMul"], [1, 0, 0, 0, 0], ) k_nodes_3 = self.model.match_parent_path( matmul_qk, ["Transpose", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul"], [1, 0, 1, 0, 0, 0], ) _, k_nodes_4, _ = self.model.match_parent_paths_all( matmul_qk, [ ( [ "Transpose", "Reshape", "Expand", "Unsqueeze", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Expand", "Where", "Equal", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Expand", "Where", "Equal", "Mul", "ConstantOfShape", "Shape", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Expand", "Where", "ConstantOfShape", "Shape", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 0, 1, 1, 0, 0, 0, 3, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Expand", "Where", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 0, 1, 2, 0, 4, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Concat", "Unsqueeze", "Mul", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0], ), ( [ "Transpose", "Reshape", "Concat", "Unsqueeze", "Gather", "Shape", "Concat", "RotaryEmbedding", "Transpose", "Reshape", "MatMul", ], [1, 0, 1, 3, 0, 0, 0, 1, 0, 0, 0], ), ], output_name_to_node=None, ) k_nodes_5 = self.model.match_parent_path( matmul_qk, ["Transpose", "Concat", "Concat", "RotaryEmbedding", "Slice", "Transpose", "Reshape", "Add", "MatMul"], [1, 0, 1, 0, 0, 0, 0, 0, 1], ) if k_nodes_1 is not None: reshape_k_2, _, concat_k, _, rotary_k, matmul_k = k_nodes_1 k_nodes = k_nodes_1 concat_k_path = self.model.match_parent_path( concat_k, ["Slice", "Unsqueeze"], [0, 2], ) if concat_k_path is None: logger.debug("fuse_rotary_attention: failed to match past/present concat in k path") return past_k = concat_k_path[0].input[0] shared_past_seq_len = concat_k_path[-1].input[0] present_k = concat_k.output[0] assert past_seq_len == shared_past_seq_len elif k_nodes_2 is not None: _, rotary_k, _, reshape_k, matmul_k = k_nodes_2 k_nodes = k_nodes_2 present_k = rotary_k.output[0] elif k_nodes_3 is not None: _, concat_k, rotary_k, _, reshape_k, matmul_k = k_nodes_3 k_nodes = k_nodes_3 past_k = concat_k.input[0] present_k = concat_k.output[0] elif k_nodes_4 is not None and len(k_nodes_4) == 9: reshape_k, matmul_k = k_nodes_4[0][-2:] concat_k, rotary_k = k_nodes_4[0][-5:-3] k_nodes = k_nodes_4 past_k = concat_k.input[0] present_k = concat_k.output[0] elif k_nodes_5 is not None: _, concat_k, concat_k_half, rotary_k, slice_k, _, reshape_k, _, matmul_k = k_nodes_5 k_nodes = k_nodes_5 past_k = concat_k.input[0] present_k = concat_k.output[0] else: logger.debug("fuse_rotary_attention: failed to match k nodes") return # q_nodes_1 is for LLaMA-2 Microsoft # q_nodes_2 is for LLaMA-2 Hugging Face # q_nodes_3 is for Phi-2 DirectML q_nodes = None slice_q = None concat_q_half = None q_nodes_1 = self.model.match_parent_path( matmul_qk, ["Reshape", "Transpose", "RotaryEmbedding", "MatMul"], [0, 0, 0, 0], ) q_nodes_2 = self.model.match_parent_path( matmul_qk, ["RotaryEmbedding", "Transpose", "Reshape", "MatMul"], [0, 0, 0, 0], ) q_nodes_3 = self.model.match_parent_path( matmul_qk, ["Concat", "RotaryEmbedding", "Slice", "Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, 0, 0, 0, 1], ) if q_nodes_1 is not None: reshape_q_2, _, rotary_q, matmul_q = q_nodes_1 q_nodes = q_nodes_1 elif q_nodes_2 is not None: rotary_q, _, reshape_q, matmul_q = q_nodes_2 q_nodes = q_nodes_2 elif q_nodes_3 is not None: concat_q_half, rotary_q, slice_q, _, reshape_q, _, matmul_q = q_nodes_3 q_nodes = q_nodes_3 else: logger.debug("fuse_rotary_attention: failed to match q nodes") return if matmul_q.input[0] != matmul_k.input[0] and matmul_k.input[0] != matmul_v.input[0]: logger.debug("fuse_rotary_attention: failed to find the same root_input for q, k, v paths") return root_output = "" if qkv_nodes == qkv_nodes_1: if not self.check_runtime_shape_paths_for_function( reshape_qkv_2, reshape_qkv_1, reshape_q_2, reshape_k_2, reshape_v_2, reshape_v_1, add_qk, matmul_q.input[0], ): logger.debug("fuse_rotary_attention: failed to verify runtime shape paths") return root_output = reshape_qkv_2.output[0] elif qkv_nodes in (qkv_nodes_2, qkv_nodes_3): if not self.check_runtime_shape_paths_for_nodes( reshape_qkv, reshape_q, reshape_k, reshape_v, matmul_q.input[0], ): logger.debug("fuse_rotary_attention: failed to verify runtime shape paths") return root_output = reshape_qkv.output[0] # Rename inputs of rotary_q/k so it connects with output of matmul_q/k # Before: MatMul --> Reshape --> Transpose --> RotaryEmbedding # After: MatMul --> RotaryEmbedding rotary_q.input[0] = slice_q.output[0] if slice_q else matmul_q.output[0] rotary_k.input[0] = slice_k.output[0] if slice_k else matmul_k.output[0] # Rename current output of rotary_k (present_key) so it doesn't match output of MHA (present_key) if concat_q_half is None: rotary_k.output[0] = rotary_k.name + "_output_0" if qkv_nodes == qkv_nodes_3: qkv_nodes = qkv_nodes[1:] def create_hidden_size_concat_node(reshape_q): """Detect num_heads and hidden_size for ONNX model from phi-2 Args: reshape_q (NodeProto): reshape node for q Returns: hidden_size_concat_node(NodeProto): Concat node to be used by reshape """ concat = self.model.match_parent(reshape_q, "Concat", 1) if concat is None: logger.debug("fuse_rotary_attention: failed to trace the concat node from reshape_q") return None # The shape is a tensor like [?, ?, num_heads, head_size] num_head_constant_node = self.model.get_constant_value(concat.input[2]) head_size_constant_node = self.model.get_constant_value(concat.input[3]) if num_head_constant_node is None or head_size_constant_node is None: logger.debug("fuse_rotary_attention: failed to get constant nodes of num_heads or head_size") return None num_head_value = num_head_constant_node[0] head_size_value = head_size_constant_node[0] hidden_size = num_head_value * head_size_value hidden_size_initilizer = self.model.create_node_name("Initializer", name_prefix="hidden_size") if self.model.get_initializer(hidden_size_initilizer) is None: self.add_initializer( name=hidden_size_initilizer, data_type=TensorProto.INT64, dims=[1], vals=[hidden_size], raw=False, ) hidden_size_reshape_node_name = self.model.create_node_name("Concat", name_prefix="hidden_size_concat") hidden_size_concat_node = helper.make_node( "Concat", inputs=[ concat.input[0], concat.input[1], hidden_size_initilizer, ], outputs=[hidden_size_reshape_node_name + "output_0"], name=hidden_size_reshape_node_name, ) hidden_size_concat_node.attribute.extend([helper.make_attribute("axis", 0)]) return hidden_size_concat_node # Add Tranpose and Reshape nodes for patial rotary embedding applied in phi-2 before passing into MHA if concat_q_half and concat_k_half: # Transpose the key output of rotary Embedding k_transpose_node_name = self.model.create_node_name("Transpose") k_tranpose_output_name = k_transpose_node_name + "_output_0" k_transpose_node = helper.make_node( "Transpose", inputs=[concat_k_half.output[0]], outputs=[k_tranpose_output_name], name=k_transpose_node_name, ) k_transpose_node.attribute.extend([helper.make_attribute("perm", [0, 2, 1, 3])]) # Transpose the query output of rotary Embedding q_transpose_node_name = self.model.create_node_name("Transpose") q_tranpose_output_name = q_transpose_node_name + "_output_0" q_transpose_node = helper.make_node( "Transpose", inputs=[concat_q_half.output[0]], outputs=[q_tranpose_output_name], name=q_transpose_node_name, ) q_transpose_node.attribute.extend([helper.make_attribute("perm", [0, 2, 1, 3])]) hidden_size_concat_node = create_hidden_size_concat_node(reshape_k) if hidden_size_concat_node is None: logger.debug("fuse_rotary_attention: failed to create hidden_size_concat_node") return # Reshape the Rotary Embedding output for key for 4D to 3D concat_k_reshape_node_name = self.model.create_node_name("Reshape", name_prefix="concat_k_half") concat_k_reshape_node = helper.make_node( "Reshape", inputs=[k_transpose_node.output[0], hidden_size_concat_node.output[0]], outputs=[concat_k_reshape_node_name + "_output_0"], name=concat_k_reshape_node_name, ) # Reshape the Rotary Embedding output for query from 4D to 3D concat_q_reshape_node_name = self.model.create_node_name("Reshape", name_prefix="concat_q_half") concat_q_reshape_node = helper.make_node( "Reshape", inputs=[q_transpose_node.output[0], hidden_size_concat_node.output[0]], outputs=[concat_q_reshape_node_name + "_output_0"], name=concat_q_reshape_node_name, ) rotary_k = concat_k_reshape_node rotary_q = concat_q_reshape_node self.nodes_to_add.append(hidden_size_concat_node) self.nodes_to_add.append(k_transpose_node) self.nodes_to_add.append(q_transpose_node) self.nodes_to_add.append(concat_k_reshape_node) self.nodes_to_add.append(concat_q_reshape_node) self.node_name_to_graph_name[hidden_size_concat_node.name] = self.this_graph_name self.node_name_to_graph_name[k_transpose_node.name] = self.this_graph_name self.node_name_to_graph_name[q_transpose_node.name] = self.this_graph_name self.node_name_to_graph_name[concat_k_reshape_node.name] = self.this_graph_name self.node_name_to_graph_name[concat_q_reshape_node.name] = self.this_graph_name new_node = self.create_mha_node( matmul_q.input[0], root_output, rotary_q, rotary_k, matmul_v, attn_mask, add_qk_str, past_k, past_v, present_k, present_v, ) if new_node is None: logger.debug("fuse_rotary_attention: failed to create multi-head attention with rotary embeddings") return self.nodes_to_add.append(new_node) self.node_name_to_graph_name[new_node.name] = self.this_graph_name self.nodes_to_remove.extend(qkv_nodes[1:]) if v_nodes != v_nodes_4: self.nodes_to_remove.extend(v_nodes[:-1] if add_v is None else v_nodes[:-2]) else: nodes_to_keep = [v_nodes[0][-1]] for temp_path in v_nodes: self.add_nodes_to_remove_with_nodes_to_keep(temp_path, nodes_to_keep) self.nodes_to_remove.extend(qk_nodes) if k_nodes == k_nodes_1: self.nodes_to_remove.extend(k_nodes[:-2]) elif k_nodes == k_nodes_2: self.nodes_to_remove.append(k_nodes[0]) self.nodes_to_remove.append(k_nodes[2]) self.nodes_to_remove.append(k_nodes[3]) elif k_nodes == k_nodes_3: self.nodes_to_remove.append(k_nodes[0]) self.nodes_to_remove.append(k_nodes[1]) self.nodes_to_remove.append(k_nodes[3]) self.nodes_to_remove.append(k_nodes[4]) elif k_nodes == k_nodes_5: self.nodes_to_remove.append(k_nodes[0]) self.nodes_to_remove.append(k_nodes[1]) elif k_nodes == k_nodes_4: nodes_to_keep = [k_nodes[0][-1], k_nodes[0][-4]] for temp_path in k_nodes: self.add_nodes_to_remove_with_nodes_to_keep(temp_path, nodes_to_keep) if q_nodes == q_nodes_1: self.nodes_to_remove.extend(q_nodes[:-2]) elif q_nodes == q_nodes_2: self.nodes_to_remove.append(q_nodes[1]) self.nodes_to_remove.append(q_nodes[2]) self.prune_graph = True class FusionRotaryEmbeddings(Fusion): def __init__(self, model: OnnxModel): self.base_name = "RotaryEmbedding" super().__init__(model, self.base_name, [self.base_name, self.base_name + ".1", "Add"]) # The RotaryEmbedding function can have multiple extraneous constant outputs even though the function is supposed to produce only one output. # This is a byproduct of a potential CSE bug when using `export_modules_as_functions` in the TorchScript exporter. # To work around this issue, we set the extraneous constant values from the RotaryEmbedding function as initializers in the locations where they are actually used. def reassign_extra_outputs(self, rot_emb_node: NodeProto, function: FunctionProto): # Find extra outputs and Constant nodes attached to those outputs extra_constants, extra_outputs = [], [] for fn_node in function.node: if fn_node.op_type == "Constant" and fn_node.input == [] and fn_node.output[0] in function.output: extra_constants.append(fn_node) output_index = list(function.output).index(fn_node.output[0]) extra_outputs.append(rot_emb_node.output[output_index]) # Set extra Constant node outputs as initializers extra_initializers = [] for extra_constant in extra_constants: constant_tensorproto = extra_constant.attribute[0].t constant_tensorproto.name = self.model.create_node_name("Constant") self.model.add_initializer(constant_tensorproto) extra_initializers.append(constant_tensorproto.name) # Update references of Constant node outputs to initializer references for extra_output, extra_initializer in zip(extra_outputs, extra_initializers): nodes_to_update = list(filter(lambda entry: extra_output in entry.input, self.model.model.graph.node)) for node_to_update in nodes_to_update: OnnxModel.replace_node_input(node_to_update, extra_output, extra_initializer) return extra_outputs def create_rotary_embeddings_from_function(self, node: NodeProto): rotary_emb_node_name = self.model.create_node_name(self.base_name) matmul_path = self.model.match_parent_path( node, ["Reshape", "MatMul"], [0, 0], ) if matmul_path is not None: reshape_node, matmul_node = matmul_path else: logger.debug("fuse_rotary_embeddings: failed to match MatMul") return rotary_emb_inputs = [ matmul_node.output[0], # x is of shape (B,S,D) instead of (B,S,N,H) node.input[1], # position_ids ] # Convert cos_cache and sin_cache from node attributes to model initializers cos_cache_node = list(filter(lambda constant: constant.output[0] == node.input[2], self.model.model.graph.node)) sin_cache_node = list(filter(lambda constant: constant.output[0] == node.input[3], self.model.model.graph.node)) cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" if ( len(cos_cache_node) == 1 and len(sin_cache_node) == 1 and self.model.get_initializer(cos_cache_name) is None and self.model.get_initializer(sin_cache_name) is None ): cos_cache = numpy_helper.to_array(cos_cache_node[0].attribute[0].t).squeeze() sin_cache = numpy_helper.to_array(sin_cache_node[0].attribute[0].t).squeeze() cos_cache_tensor = helper.make_tensor( name=cos_cache_name, data_type=TensorProto.FLOAT, dims=list(cos_cache.shape), vals=cos_cache.flatten().tolist(), ) self.model.add_initializer(cos_cache_tensor, self.this_graph_name) sin_cache_tensor = helper.make_tensor( name=sin_cache_name, data_type=TensorProto.FLOAT, dims=list(sin_cache.shape), vals=sin_cache.flatten().tolist(), ) self.model.add_initializer(sin_cache_tensor, self.this_graph_name) self.nodes_to_remove.extend([cos_cache_node[0], sin_cache_node[0]]) rotary_emb_inputs.extend([cos_cache_name, sin_cache_name]) rotary_emb_outputs = node.output if len(rotary_emb_outputs) > 1: # Re-assign extraneous constant outputs in RotaryEmbedding functions as initializers func = list(filter(lambda fn: fn.name == node.op_type, self.model.model.functions)) assert len(func) == 1 extra_outputs = self.reassign_extra_outputs(node, func[0]) rotary_emb_outputs = list(filter(lambda output_name: output_name not in extra_outputs, rotary_emb_outputs)) assert len(rotary_emb_outputs) == 1 rotary_emb_node = helper.make_node( self.base_name, inputs=rotary_emb_inputs, outputs=rotary_emb_outputs, name=rotary_emb_node_name, interleaved=1, ) rotary_emb_node.domain = "com.microsoft" self.nodes_to_remove.append(reshape_node) return rotary_emb_node def create_rotary_embeddings_from_nodes( self, root_input: str, position_ids: str, cos_slice: str, sin_slice: str, output: str, ): rotary_emb_node_name = self.model.create_node_name(self.base_name) # Convert cos_cache and sin_cache from node attributes to model initializers cos_cache_node = list(filter(lambda constant: constant.output[0] == cos_slice, self.model.model.graph.node)) sin_cache_node = list(filter(lambda constant: constant.output[0] == sin_slice, self.model.model.graph.node)) cos_cache_name, sin_cache_name = "cos_cache", "sin_cache" if ( len(cos_cache_node) == 1 and len(sin_cache_node) == 1 and self.model.get_initializer(cos_cache_name) is None and self.model.get_initializer(sin_cache_name) is None ): cos_cache = numpy_helper.to_array(cos_cache_node[0].attribute[0].t).squeeze() sin_cache = numpy_helper.to_array(sin_cache_node[0].attribute[0].t).squeeze() # Reshape cos/sin cache from (M, H) to (M, H/2) head_size = cos_cache.shape[1] cos_cache = cos_cache[:, : (head_size // 2)] sin_cache = sin_cache[:, : (head_size // 2)] cos_cache_tensor = helper.make_tensor( name=cos_cache_name, data_type=TensorProto.FLOAT, dims=list(cos_cache.shape), vals=cos_cache.flatten().tolist(), ) self.model.add_initializer(cos_cache_tensor, self.this_graph_name) sin_cache_tensor = helper.make_tensor( name=sin_cache_name, data_type=TensorProto.FLOAT, dims=list(sin_cache.shape), vals=sin_cache.flatten().tolist(), ) self.model.add_initializer(sin_cache_tensor, self.this_graph_name) self.nodes_to_remove.extend([cos_cache_node[0], sin_cache_node[0]]) rotary_emb_node = helper.make_node( self.base_name, inputs=[root_input, position_ids, cos_cache_name, sin_cache_name], outputs=[output], name=rotary_emb_node_name, interleaved=0, ) rotary_emb_node.domain = "com.microsoft" return rotary_emb_node def fuse(self, node, input_name_to_nodes, output_name_to_node): # Node is either RotaryEmbedding function or Add if self.base_name not in node.op_type and node.op_type != "Add": return # Check if node is "RotaryEmbedding nn.Module" exported as a function # (e.g. export_modules_as_functions={RotaryEmbedding} in torch.onnx.export) rotary_emb_node = None if node.op_type != "Add": # Verify that function has the correct inputs if len(node.input) not in {4, 5} or node.input[1] not in { "pos", "pos_id", "position_id", "pos_ids", "position_ids", }: logger.debug("fuse_rotary_embeddings: failed to verify inputs for RotaryEmbedding function") return rotary_emb_node = self.create_rotary_embeddings_from_function(node) if rotary_emb_node is None: logger.debug("fuse_rotary_embeddings: failed to create RotaryEmbedding node") return # Remove RotaryEmbedding function self.nodes_to_remove.append(node) # Remove RotaryEmbedding function's shape inference stored in value_info # The new shape will be calculated during symbolic shape inference old_shape_infer = list( filter(lambda node: node.name == rotary_emb_node.output[0], self.model.model.graph.value_info) ) assert len(old_shape_infer) == 1 self.model.model.graph.value_info.remove(old_shape_infer[0]) else: # Rotary embeddings are defined using the below functions: # # def rotate_half(x): # """Rotates half the hidden dims of the input.""" # x1 = x[..., : x.shape[-1] // 2] # x2 = x[..., x.shape[-1] // 2 :] # return torch.cat((-x2, x1), dim=-1) # # def apply_rope(x, cos, sin, position_ids): # cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] # sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] # cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] # x_embed = (x * cos) + (rotate_half(x) * sin) # return x_embed # Check paths for rotate_half(x) rotate_half_x2_path_1_1 = self.model.match_parent_path( node, ["Mul", "Concat", "Neg", "Slice", "Transpose"], [1, 0, 0, 0, 0], ) rotate_half_x2_path_1_2 = self.model.match_parent_path( node, ["Mul", "Concat", "Neg", "Slice", "Slice"], [1, 0, 0, 0, 0], ) rotate_half_x2_path_1 = rotate_half_x2_path_1_1 or rotate_half_x2_path_1_2 rotate_half_x2_path_2_1 = self.model.match_parent_path( node, ["Mul", "Concat", "Neg", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Transpose"], [1, 0, 0, 0, 1, 0, 0, 0, 0], ) rotate_half_x2_path_2_2 = self.model.match_parent_path( node, ["Mul", "Concat", "Neg", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Slice"], [1, 0, 0, 0, 1, 0, 0, 0, 0], ) rotate_half_x2_path_2 = rotate_half_x2_path_2_1 or rotate_half_x2_path_2_2 if rotate_half_x2_path_1 is None or rotate_half_x2_path_2 is None: logger.debug("fuse_rotary_embeddings: failed to match x2 in rotate_half") return rotate_half_x1_path_1_1 = self.model.match_parent_path( node, ["Mul", "Concat", "Slice", "Transpose"], [1, 0, 1, 0], ) rotate_half_x1_path_1_2 = self.model.match_parent_path( node, ["Mul", "Concat", "Slice", "Slice"], [1, 0, 1, 0], ) rotate_half_x1_path_1 = rotate_half_x1_path_1_1 or rotate_half_x1_path_1_2 rotate_half_x1_path_2_1 = self.model.match_parent_path( node, ["Mul", "Concat", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Transpose"], [1, 0, 1, 2, 0, 0, 0, 0], ) rotate_half_x1_path_2_2 = self.model.match_parent_path( node, ["Mul", "Concat", "Slice", "Unsqueeze", "Div", "Gather", "Shape", "Slice"], [1, 0, 1, 2, 0, 0, 0, 0], ) rotate_half_x1_path_2 = rotate_half_x1_path_2_1 or rotate_half_x1_path_2_2 if rotate_half_x1_path_1 is None or rotate_half_x1_path_2 is None: logger.debug("fuse_rotary_embeddings: failed to match x1 in rotate_half") return if ( rotate_half_x1_path_1[-1].name != rotate_half_x1_path_2[-1].name or rotate_half_x2_path_1[-1].name != rotate_half_x2_path_2[-1].name or rotate_half_x1_path_1[-1].name != rotate_half_x2_path_1[-1].name or rotate_half_x1_path_2[-1].name != rotate_half_x2_path_2[-1].name ): logger.debug("fuse_rotary_embeddings: failed to match common input in rotate_half") return # Check path for x x_path_1 = self.model.match_parent_path( node, ["Mul", "Transpose"], [0, 0], ) x_path_2 = self.model.match_parent_path( node, ["Mul", "Slice"], [0, 0], ) x_path = x_path_1 or x_path_2 if x_path is None: logger.debug("fuse_rotary_embeddings: failed to match x in rotate_half") return # Check path for sin sin_path, sin_cache, position_ids = None, "", "" sin_path_1 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Gather", "Shape"], [1, 1, 0, 0, 0, 0, 2, 0, 0], ) sin_path_2 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Add"], [1, 1, 0, 0, 0, 0, 2, 0], ) sin_path_3 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Gather", "Shape"], [1, 1, 0, 0, 2, 0, 0], ) sin_path_4 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Add"], [1, 1, 0, 0, 2, 0], ) if sin_path_1 is not None: sin_path = sin_path_1 sin_cache = sin_path[-4].input[0] elif sin_path_2 is not None: sin_path = sin_path_2 sin_cache = sin_path[-3].input[0] elif sin_path_3 is not None: sin_path = sin_path_3 sin_cache = sin_path[-4].input[0] position_ids = sin_path[2].input[1] elif sin_path_4 is not None: sin_path = sin_path_4 sin_cache = sin_path[-3].input[0] position_ids = sin_path[2].input[1] else: logger.debug("fuse_rotary_embeddings: failed to match sin path in apply_rope") return # Check path for cos cos_path, cos_cache = None, "" cos_path_1 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Gather", "Shape"], [0, 1, 0, 0, 0, 0, 2, 0, 0], ) cos_path_2 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Squeeze", "Squeeze", "Slice", "Unsqueeze", "Add"], [0, 1, 0, 0, 0, 0, 2, 0], ) cos_path_3 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Gather", "Shape"], [0, 1, 0, 0, 2, 0, 0], ) cos_path_4 = self.model.match_parent_path( node, ["Mul", "Unsqueeze", "Gather", "Slice", "Unsqueeze", "Add"], [0, 1, 0, 0, 2, 0], ) if cos_path_1 is not None: cos_path = cos_path_1 cos_cache = cos_path[-4].input[0] elif cos_path_2 is not None: cos_path = cos_path_2 cos_cache = cos_path[-3].input[0] elif cos_path_3 is not None: cos_path = cos_path_3 cos_cache = cos_path[-4].input[0] position_ids = cos_path[2].input[1] elif cos_path_4 is not None: cos_path = cos_path_4 cos_cache = cos_path[-3].input[0] position_ids = cos_path[2].input[1] else: logger.debug("fuse_rotary_embeddings: failed to match sin path in apply_rope") return # Check path for position ids if position_ids == "": position_ids_from_sin_path = self.model.match_parent_path( sin_path[2], ["Reshape"], [1], ) position_ids_from_cos_path = self.model.match_parent_path( cos_path[2], ["Reshape"], [1], ) if ( position_ids_from_sin_path is None or position_ids_from_cos_path is None or position_ids_from_sin_path[0].name != position_ids_from_cos_path[0].name ): logger.debug("fuse_rotary_embeddings: failed to match position ids path in apply_rope") return position_ids = position_ids_from_cos_path[0].input[0] else: position_ids_from_sin_path = [] position_ids_from_cos_path = [] past_seq_len_path, curr_seq_len_path = None, None if (sin_path == sin_path_1 and cos_path == cos_path_1) or ( sin_path == sin_path_3 and cos_path == cos_path_3 ): if sin_path[-2].name != cos_path[-2].name or sin_path[-1].name != cos_path[-1].name: logger.debug( "fuse_rotary_embeddings: failed to match common Gather node and Shape node in sin cache and cos cache" ) return elif (sin_path == sin_path_2 and cos_path == cos_path_2) or ( sin_path == sin_path_4 and cos_path == cos_path_4 ): if sin_path[-1].name != cos_path[-1].name: logger.debug("fuse_rotary_embeddings: failed to match common Add node in sin cache and cos cache") return # Match past sequence length path: past_key --> Shape --> Gather --> Add past_seq_len_path = self.model.match_parent_path( sin_path[-1], ["Gather", "Shape"], [1, 0], ) # Match current sequence length path: transpose_k --> Shape --> Gather --> Add curr_seq_len_path = self.model.match_parent_path( sin_path[-1], ["Gather", "Shape", "Transpose"], [0, 0, 0], ) if ( past_seq_len_path is None or curr_seq_len_path is None or self.model.find_graph_input(past_seq_len_path[-1].input[0]) is None or curr_seq_len_path[-1].op_type != "Transpose" ): logger.debug("fuse_rotary_embeddings: failed to match past_seq_len and curr_seq_len paths") return else: logger.debug("fuse_rotary_embeddings: failed to match common cache paths") rotary_emb_node = self.create_rotary_embeddings_from_nodes( rotate_half_x1_path_1[-1].output[0], position_ids, cos_cache, sin_cache, node.output[0], ) if rotary_emb_node is None: logger.debug("fuse_rotary_embeddings: failed to create RotaryEmbedding node") return # Remove rotary embedding nodes self.add_nodes_to_remove([node]) self.add_nodes_to_remove(rotate_half_x1_path_1[:-1]) self.add_nodes_to_remove(rotate_half_x1_path_2[:-1]) self.add_nodes_to_remove(rotate_half_x2_path_1[:-1]) self.add_nodes_to_remove(rotate_half_x2_path_2[:-1]) self.add_nodes_to_remove(x_path[:-1]) self.add_nodes_to_remove(sin_path) self.add_nodes_to_remove(cos_path) self.add_nodes_to_remove(position_ids_from_sin_path[:-1]) self.add_nodes_to_remove(position_ids_from_cos_path[:-1]) if past_seq_len_path is not None and len(self.model.get_children(past_seq_len_path[0])) == 1: # In merged HF model, output of Gather in past_seq_len_path is used twice # for past_key_values.0.key and once for other past_key_values self.add_nodes_to_remove(past_seq_len_path) if curr_seq_len_path is not None: self.add_nodes_to_remove(curr_seq_len_path[:-1]) self.increase_counter(self.base_name) self.node_name_to_graph_name[rotary_emb_node.name] = self.this_graph_name self.nodes_to_add.append(rotary_emb_node) self.prune_graph = True