� %�g�h�� ddlZddlZddlZddlZddlZddlZddlZddlZddlZddl Z ddl Z ddlmZm Z ddlmZmZddlZddlmZddlmZddlmZmZddlmZmZmZdd lmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&dd l'm(Z(m)Z)ddl*m+Z+m,Z,m-Z-ddl.m/Z/m.Z.dd l0m1Z1m2Z2e"d��rddl3Z3ed��rddl4Z4e$d��rddl5Z5e!d��rddl6Z6ddl7m8Z8ddl9m:Z;dZ<ej=e>��Z?d�Z@d�ZAdeeBeCffd�ZDdejEfd�ZFdejGdeHejIeBeBffd�ZJ dedejKdeCdeeBeCejIfdeejGdeeeCejEfd eejLd!eeMeBeMejIejGfffd"�ZN dfdejKd$eOd%eOd&eOfd'�ZPdgdejKd%eOfd(�ZQGd)�d*eR��ZSd+ejKfd,�ZTd-�ZUd.�ZVd+ejjKfd/�ZWd0�ZXdeeCejIfdejEfd1�ZY dhd+ejKdeeeCejIfd2eeMeCeeCejIffd3eOfd4�ZZ did+ejKdeeeCejIfd2eeMeCeeCejIfffd5�Z[d6eReHeCejjKfd7eMeCeBfd8eReCfd9�Z\djd:eeMeeBeCfeeBeCfffd;�Z]dkd=eMeCeeBeCejIffd>eCfd?�Z^d@�Z_dA�Z` dld+ejKd:eeMeeBeCfeeBeCffd8eeReCdeeeCejEfd2eeMeCeeCejIffdBeOfdC�Zad+ejjKfdD�Zb ded+ejKd:eeMeeBeCfeeBeCffd8eeReCdeeeCejEfd2eeMeCeeCejIffdeHeReeBeCfeMeeBeCfeeBeCffeReeBeCfeReBeMeCeBfeReReCeReCeReHeCejKfffdE�ZcdeHeBeReCeRejKffdF�Zd did6eReHeCejKfd7eMeCeBfdGeeBeCfd8eeReCdHeeReReCdeHeeCeejKeReHeCejKfffdI�Ze dmd+ejKd:eeMeeBeCfeeBeCffd8eeReCdeeeCejEfd2eeMeCeeCejEffdJeOdKeOdLeOdMeOfdN�Zfd+ejKd=eMeCeeBeCejIfffdO�ZgdjdP�Zhd+ejKfdQ�Zi dndejKd>eCdSeMeCeeCejjffdTeeBeCejIffdU�Zk dod+ejKdVeeCejlfd=eeMeCeeBeCejIffdWeeeCejlfdeeeCejEfdXeOdLeOdYeReCdZeOd[eOfd\�Zmdpd]eOd^efd_�Zndjd`efda�ZodejjKdeOfdb�ZpejqdjdejjKdceejIfdd��ZrdS)q�N)�OrderedDict�defaultdict)�Optional�Union�)�AcceleratorState�)�SAFE_WEIGHTS_NAME�WEIGHTS_NAME)�AutocastKwargs�CustomDtype�DistributedType) �is_hpu_available�is_mlu_available�is_mps_available�is_musa_available�is_npu_available�is_peft_available�is_sdaa_available�is_torch_xla_available�is_xpu_available)�clear_device_cache�get_xpu_available_memory)�load_offloaded_weight�offload_weight�save_offload_index)�is_tqdm_available�tqdm)�compare_versions�is_torch_versionF)�check_device)� safe_open)� load_filezpytorch_model.bin.index.jsonc��ddlm}t��rddlm}t��ot||��|��S)Nr )�extract_model_from_parallelr)� PeftModel)�otherr%r�peftr&� isinstance)�modelr%r&s �i/home/asafur/pinokio/api/open-webui.git/app/env/lib/python3.11/site-packages/accelerate/utils/modeling.py� is_peft_modelr,Is\��2�2�2�2�2�2��#�"�"�"�"�"�"��\�:�.I�.I�%�.P�.P�R[�#\�#\�\�c��|j|jkrdS|jdkr"|j�tj|jd��}|jdkr"|j�tj|jd��}||kS)ar Utility method to check if two `torch` devices are similar. When dealing with CUDA devices, torch throws `False` for `torch.device("cuda") == torch.device("cuda:0")` whereas they should be the same Args: first_device (`torch.device`): First device to check second_device (`torch.device`): Second device to check F�cpuNr��index)�typer1�torch�device)�first_device� second_devices r+�check_device_samer7Rs��M�.�.�.��u��E�!�!�l�&8�&@��|�L�$5�Q�?�?�?��U�"�"�}�':�'B��]�%7�q�A�A�A� ��=�(�(r-�sizec��d}d|�d�} t|t��r|}�n0|��d��r)tt |dd��dz��}�n�|��d��r)tt |dd��d z��}�n�|��d ��r)tt |dd��dz��}�n@|��d��rDtt |dd ��dz��}|�d��r|dzn|}n�|��d��rDtt |dd ��dz��}|�d��r|dzn|}nj|��d��rCtt |dd ��dz��}|�d��r|dzn|}n#t $rt|��wxYw|dkrt|��|S)a( Converts a size expressed as a string with digits an unit (like `"5MB"`) to an integer (in bytes). Args: size (`int` or `str`): The size to convert. Will be directly returned if an `int`. Example: ```py >>> convert_file_size_to_int("1MiB") 1048576 ``` ��z`size` z] is not in a valid format. Use an integer for bytes, or a string with an unit (like '5.0GB').�GIBN��i@�MIBi�KIBi�GB��iʚ;�b��MBi@B�KBi�r)r)�int�upper�endswith�float� ValueError)r8�mem_size�err_msg�int_sizes r+�convert_file_size_to_intrMmsm��H�u�$�u�u�u��"��d�C� � � I��H�H� �Z�Z�\�\� "� "�5� )� )� I��5��c�r�c��+�+�u�5�6�6�H�H� �Z�Z�\�\� "� "�5� )� )� I��5��c�r�c��+�+�u�5�6�6�H�H� �Z�Z�\�\� "� "�5� )� )� I��5��c�r�c��+�+�u�5�6�6�H�H� �Z�Z�\�\� "� "�4� (� (� I��5��c�r�c��+�+�u�5�6�6�H�(,� � �c�(:�(:�H�x�1�}�}��H�H� �Z�Z�\�\� "� "�4� (� (� I��5��c�r�c��+�+�u�5�6�6�H�(,� � �c�(:�(:�H�x�1�}�}��H�H� �Z�Z�\�\� "� "�4� (� (� I��5��c�r�c��+�+�u�5�6�6�H�(,� � �c�(:�(:�H�x�1�}�}��H��"�"�"��!�!�!�"��!�|�|��!�!�!��Os�I I�I.�dtypec��|tjkrdS|tjkrdS|tjkrdS|tjkrdSt dd��r|tjkrdStj dt|��}|�td |�d ��t|� ��d��}|dzS) z� Returns the size (in bytes) occupied by one parameter of type `dtype`. Example: ```py >>> dtype_byte_size(torch.float32) 4 ``` g�?g�?g�?r �>=z2.1.0z[^\d](\d+)$Nz`dtype` is not a valid dtype: �.rrB)r3�boolr �INT2�INT4�FP8r � float8_e4m3fn�re�search�strrIrE�groups)rN� bit_search�bit_sizes r+�dtype_byte_sizer]�s�� u� �+�"� "� "��u� �+�"� "� "��u� �+�/� !� !��q� �$�� (� (��U�e�6I�-I�-I��q��>�3�u�:�:�6�6�J��B�%�B�B�B�C�C�C��:�$�$�&�&�q�)�*�*�H��q�=�r-�tensor�returnc��tjdtjdtjdtjdtjdtjdtjdtjdtj dtj di } |��}|�� }n�#t$r� |��}|��||jz}n4#t$$r'd}|��||jz}YnwxYwYnwxYw|j||fS)a� Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with non-overlapping lifetimes may have the same id. rB�rr r)r3�int64�float32�int32�bfloat16�float16�int16�uint8�int8rR�float64�untyped_storage�data_ptr�nbytes� Exception�storager8rN�NotImplementedError�nelementr4)r^�_SIZE�storage_ptr�storage_sizes r+�id_tensor_storageru�sR�� Q� � �q� ��Q� �� q� ��Q� ��Q� � �A� � �A� � �q� �E�C��,�,�.�.�7�7�9�9��-�-�/�/�6�6�8�8�� C� C� C� C� �.�.�*�*�3�3�5�5�K�!�>�>�+�+�0�0�2�2�U�6�<�5H�H�L�L��"� C� C� C��K�!�?�?�,�,�u�V�\�/B�B�L�L�L� C�� C��=�+�|�3�3s8�<AC � E&�AD/�.E&�/.E �E&�E � E&�%E&�module�tensor_namer4�value�fp16_statistics�tied_params_mapc ��d|vrS|�d��}|dd�D]+}t||��} | �t|�d|�d��| }�,|d}||jvr||jvrt|�d|�d��||jv} t||��}|�^|�\|��|vrF|||��vr*||��||j|<dS|�\|��|vrF|||��vr*||��||j|<dS|jtjd��kr/|dtjd��fvr|�t|�d|�d��||jvr |j|nd}t|��} |��|j |j kr.| j dkr#td |j �d |�d|j �d��|�|�|j��}n<t|j��d ��s|�|��}d}tj��5|�:|jjdkr*tj|��jdkr | j dvr|}d}t!|t"��r`t%��rd|��}nLt'��rd|��}n8t)��rd|��}n$t+��rd|��}nt-��rd}dt|��vr t/��st|�d��|��|�|��}|�p|dtjd��fvrXt|j��d ��s|�|��}| s| ||j��|j|<nFt!|tj��r|�|��}ntj||��}|�|}| r||j|<�n-|�3t7tj|��|j|j��s�t|j|��} |j|j}| j dvr�| j dkr4|jtjkr|�tj��}|dkr�| j dkrv| |fd|ji|��d��d��}|j�d��|_|j �d��|_ �n7| |fd|ji|��|��}�n| j dvr:tj!�"||j��|��}n�| j dvr�tFj$�%d��tMdd d!��r |j'f}n|j(f}||j)|j |j*|j+|j,fz }tj!�"| |�|j��|��}n%| ||j��|��}||j|<|�&|�|��|j|_ ~|j-j d"kr�t|j.d#d��t|j.j��dkr�tj|��jdkrtj|��j/nd}t|j.d#d��sP|�N|j0�+|j0jjdkr|�1|��}n�|j0�|�1|��}n�|j-j d$kr�t|j.d%d��t|j.j��dkrotj|��jdkrtj|��j/nd}t|j.d%d��s!|�|j.�1|��|_.ddd��n#1swxYwY|dkrte��|�Q|��|vr;|||��vr|||��|<dS|�S|�S|��|vr?|||��vr%|||��|<dSdSdSdSdS)&a� A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). Args: module (`torch.nn.Module`): The module in which the tensor we want to move lives. tensor_name (`str`): The full name of the parameter/buffer. device (`int`, `str` or `torch.device`): The device on which to set the tensor. value (`torch.Tensor`, *optional*): The value of the tensor (useful when going from the meta device to any other device). dtype (`torch.dtype`, *optional*): If passed along the value of the parameter will be cast to this `dtype`. Otherwise, `value` will be cast to the dtype of the existing parameter in the model. fp16_statistics (`torch.HalfTensor`, *optional*): The list of fp16 statistics to set on the module, used for 8 bit model serialization. tied_params_map (Dict[int, Dict[torch.device, torch.Tensor]], *optional*, defaults to `None`): A map of current data pointers to dictionaries of devices to already dispatched tied weights. For a given execution device, this parameter is useful to reuse the first available pointer of a shared weight on the device for all others, instead of duplicating memory. rQNr:z has no attribute z- does not have a parameter or a buffer named �metaz7 is on the meta device, we need a `value` to put in on � Params4bitz Trying to set a tensor of shape z in "z" (which has shape z), this looks incorrect.�z torch.uintz torch.intz torch.bool�cuda)� Int8Params� FP4Paramsr}r/�npu:zmlu:zsdaa:zmusa:�hpu�xpuz6 is not available, you should use device="cpu" instead)� requires_grad�r4r�r�r)�QTensor�QBitsTensor)�AffineQuantizedTensor�torchaorPz0.7.0�Linear8bitLt�SCB� Linear4bit�quant_state)3�split�getattrrI�_parameters�_buffersrlr4r3r2�shape�__name__�torNrY� startswith�no_gradr)rErrrrrrr��Tensorr^r7�__dict__rcrf�CBr��nn� Parameter� importlib�util� find_specr�tensor_impl� layout_tensor� block_size� quant_min� quant_max�zero_point_domain� __class__�weightr1�biasrr)rvrwr4rxrNryrz�splitsr�� new_module� is_buffer� old_value�param� param_cls�device_quantization� new_value�kwargs�args�device_indexs r+�set_module_tensor_to_devicer��sR ��B�k��"�"�3�'�'��C�R�C�[� � �E� ��/�/�J��!� �F�!F�!F�e�!F�!F�!F�G�G�G��F�F��R�j��&�,�,�,��F�O�1S�1S��F�_�_�Q\�_�_�_�`�`�`��v��.�I��,�,�I� ��'��N�N��/�/��o�e�n�n�&6�&6�7�7�7�*9�%�.�.�:J�:J�*K�F�*S��;�'��#�� O�3�3��o�i�&8�&8�&:�&:�;�;�;�*9�)�:L�:L�:N�:N�*O�PV�*W��;�'��5�<��/�/�/�/�F�6�5�<�X^�K_�K_�B`�4`�4`�ej�er��K�i�i�`f�i�i�i�j�j�j�/:�f�>P�/P�/P�F��{�+�+�VZ�E��U��I��?�e�k�)�)�i�.@�L�.P�.P��O�5�;�O�O�[�O�O�en�et�O�O�O�� =��H�H�Y�_�-�-�E�E��U�[�!�!�,�,�-V�W�W� $��H�H�U�O�O�E�� eE�eE� ��!�V�+�+��V�$�$�)�V�3�3��"�&O�O�O�"(��F��f�c�"�"� ��!�!� �(��!�#�#� �(��"�$�$� �)��)�)��"�$�$� �)��)�)��!�#�#� ��C��K�K��(8�(:�(:��^�^�^�_�_�_��=�!��V�,�,�I�� V��V�8L�8L�/M�%M�%M��9�?�+�+�6�6�7`�a�a�4� )��U� 3� 3�I� �r�6?�i� �Yb�Yp�6q�6q�6q�F�&�{�3�� u�|� ,� ,� ;��(�(�I�I��U�6�:�:�:�I��*�(�F��> E�+4�F�O�K�(�(� � �&7��V�8L�8L�f�N`�al�Nm�Nt�&u�&u� ��V�/��<�=�=�I��'��4�=�F��!�%N�N�N��%��5�5�)�/�U�]�:Z�:Z� )��U�]� ;� ;�I��U�?�?�y�'9�\�'I�'I� )� �)� e� e�9�CZ� e�^d� e� e� h� h�ij� k� k� n� n�ot� u� u�I�#,�<�?�?�5�#9�#9�I�L�$-�M�$4�$4�U�$;�$;�I�M�M� )� �)� e� e�9�CZ� e�^d� e� e� h� h�io� p� p�I�I��#�'A�A�A�!�H�.�.�y� �H_�.�`�`�c�c�dj�k�k� � ��#�'@�@�@��>�+�+�I�6�6�B�GW�Xa�cg�ip�Gq�Gq�B�%�1�3�D�D�%�3�5�D��(��O��'��'��/��"�H�.�.�y�y�$�/?�y�Of�.�g�g�j�j�kq�r�r� � �%�I�i�y�?V�W�W�W�Z�Z�[a�b�b� �.7�F��{�+��*�6E�6H�6H��6P�6P��"�;�/�3�#�� )�^�;�;��F�M�5�$�7�7�?�� ,�-�-��7�7�>C�\�&�=Q�=Q�=V�Z`�=`�=`�u�|�F�3�3�9�9�fj��v�}�e�T�:�:�;�|�?W��{�.�6�;�3E�3J�f�3T�3T�!'��\�!:�!:��,�!'��\�!:�!:�� )�\�9�9��F�M�=�$�?�?�G�� ,�-�-��7�7�>C�\�&�=Q�=Q�=V�Z`�=`�=`�u�|�F�3�3�9�9�fj��v�}�m�T�B�B�E�|�G_�$*�M�$6�$6�|�$D�$D�F�M�KeE�eE�eE�eE�eE�eE�eE�eE�eE�eE�eE��eE�eE�eE�eE�N�� #�� O�3�3��/�)�*<�*<�*>�*>�?�?�?�8A�� *�*�,�,�-�f�5�5�5� ��'��N�N��/�/��/�%�.�.�*:�*:�;�;�;�4=��(�(�)�&�1�1�1� ��'�'�/�/�;�;s�7W2a5�5a9�<a9T�include_buffers�recurse�remove_non_persistentc#��K�|�|��Ed{V��|rGt��}|rt||��}|�|��D]}|\}}||vr|V��dSdS)a? A helper function that gathers all the tensors (parameters + buffers) of a given module. If `include_buffers=True` it's the same as doing `module.named_parameters(recurse=recurse) + module.named_buffers(recurse=recurse)`. Args: module (`torch.nn.Module`): The module we want the tensors on. include_buffer (`bool`, *optional*, defaults to `True`): Whether or not to include the buffers in the result. recurse (`bool`, *optional`, defaults to `False`): Whether or not to go look in every submodule or just return the direct parameters and buffers. remove_non_persistent (`bool`, *optional*, defaults to `False`): Whether or not to remove the non persistent buffer from the buffers. Useful only when include_buffers = True �r�N)�named_parameters�set�get_non_persistent_buffers� named_buffers)rvr�r�r��non_persistent_buffers�named_buffer�name�_s r+�named_module_tensorsr��s��$�&�&�w�&�7�7�7�7�7�7�7�7�7��#�!$�� Y�%?��PW�%X�%X�%X�"�"�0�0��0�A�A� #� #�L�"�G�D�!��1�1�1�"�"�"�"��#�#� #� #r-c�`�|j}|r$|��D]\}}||jz}�|S)aV Gather all non persistent buffers of a given modules into a set Args: module (`nn.Module`): The module we want the non persistent buffers on. recurse (`bool`, *optional*, defaults to `False`): Whether or not to go look in every submodule or just return the direct non persistent buffers. )�_non_persistent_buffers_set� named_modules)rvr��non_persistent_buffers_setr��ms r+r�r��sN��"(�!C��H��(�(�*�*� H� H�D�A�q�&�!�*G�G�&�&�%�%r-c�(��eZdZdZ�fd�Zd�Z�xZS)�FindTiedParametersResultz� This is a subclass of a list to handle backward compatibility for Transformers. Do not rely on the fact this is not a list or on the `values` method as in the future this will be removed. c�:��t��j|i|��dS�N)�super�__init__)�selfr�r�r�s �r+r�z!FindTiedParametersResult.__init__�s%��$�)�&�)�)�)�)�)r-c�j�tjdt��td�|D��g��S)NzhThe 'values' method of FindTiedParametersResult is deprecated and will be removed in Accelerate v1.3.0. c�"�g|]}|dd�� S)r N�)�.0�xs r+� <listcomp>z3FindTiedParametersResult.values.<locals>.<listcomp>�s ��(�(�(�a�A�a�b�b�E�(�(�(r-)�warnings�warn� FutureWarning�sum)r�s r+�valueszFindTiedParametersResult.values�s=�� v�� (�(�4�(�(�(�"�-�-�-r-)r�� __module__�__qualname__�__doc__r�r�� __classcell__)r�s@r+r�r��sQ�� *�*�*�*�*�.�.�.�.�.�.�.r-r�r*c��d}d}d}dd�tj|j��D��vr�t|d��o)t |jdd��o|��}t|d��o+t |jdd��ot |jdd��}td�|��D��}t|||g��S) z� Check if there is any indication in the given model that some weights should be tied. Args: model (`torch.nn.Module`): The model to inspect Returns: bool: True if the model needs to have tied weights F�PreTrainedModelc��g|] }|j�� Sr�)r�)r��cs r+r�z3check_tied_parameters_in_config.<locals>.<listcomp>�s��Q�Q�Q�A�Q�Z�Q�Q�Qr-�config�tie_word_embeddings�is_encoder_decoder�tie_encoder_decoderc3�6K�|]}t|d��V��dS)�_tie_weightsN)�hasattr)r�rvs r+� <genexpr>z2check_tied_parameters_in_config.<locals>.<genexpr>s,��\�\�&�g�f�n�=�=�\�\�\�\�\�\r-) �inspect�getmror�r�r�r��get_output_embeddings�any�modules)r*�has_tied_word_embedding�has_tied_encoder_decoder�has_tied_modules r+�check_tied_parameters_in_configr��s��$��$��O��Q�Q��1P�1P�Q�Q�Q�Q�Q��E�8�$�$� .��&;�U�C�C� .��+�+�-�-� � �E�8�$�$� D��&:�E�B�B� D��&;�U�C�C� !� �\�\�E�M�M�O�O�\�\�\�\�\��'�)A�?�S�T�T�Tr-c��||vr||Sd�|�d��dd��}||krtd|�d��t||��S)NrQr:z-The `device_map` does not contain the module )�joinr�rI�_get_param_device)r�� device_map�parent_params r+r�r�ss�� %� � ��8�8�E�K�K��,�,�S�b�S�1�2�2�L��u��Q��Q�Q�Q�R�R�R� ��z�:�:�:r-c��|D]l}i}|D]}t||��||<�tt|��dkrt�d|�d��mdS)a9 Check if tied parameters are on the same device Args: tied_params (`List[List[str]]`): A list of lists of parameter names being all tied together. device_map (`Dict[str, Union[int, str, torch.device]]`): A map that specifies where each submodule should go. r z*Tied parameters are on different devices: zC. Please modify your custom device map or set `device_map='auto'`. N)r��lenr�r��loggerr�)�tied_paramsr�� tie_param�tie_param_devicesr�s r+�$check_tied_parameters_on_same_devicer�s��!�� L� L�E�'8�� 'K�'K��e�$�$��s�$�+�+�-�-�.�.�/�/�!�3�3��K�K�T�=N�T�T�T� � � ��r-c��d�|�d��D��}d�|�d��D��}t|��t|��z }i}|D]L}||}|��D]-\}} | |ur$||vrg||<||�|��.�Mtd�|��D��S)a� Find the tied parameters in a given model. <Tip warning={true}> The signature accepts keyword arguments, but they are for the recursive part of this function and you should ignore them. </Tip> Args: model (`torch.nn.Module`): The model to inspect. Returns: List[List[str]]: A list of lists of parameter names being all tied together. Example: ```py >>> from collections import OrderedDict >>> import torch.nn as nn >>> model = nn.Sequential(OrderedDict([("linear1", nn.Linear(4, 4)), ("linear2", nn.Linear(4, 4))])) >>> model.linear2.weight = model.linear1.weight >>> find_tied_parameters(model) [['linear1.weight', 'linear2.weight']] ``` c��i|]\}}||�� Sr�r��r�r�r�s r+� <dictcomp>z(find_tied_parameters.<locals>.<dictcomp>Ls��j�j�j�K�D�%�D�%�j�j�jr-F)�remove_duplicatec��i|]\}}||�� Sr�r�rs r+rz(find_tied_parameters.<locals>.<dictcomp>Ps��$r�$r�$r�[�T�5�T�5�$r�$r�$rr-Tc �n�g|]2\}}t|gtt|��z��3Sr�)�sorted�listr�)r�r��tieds r+r�z(find_tied_parameters.<locals>.<listcomp>as8��$v�$v�$v�L�F�TX�V�V�H�t�C��I�I��,F�%G�%G�$v�$v�$vr-)r�r��keys�items�appendr�) r*r��all_named_parameters�no_duplicate_named_parameters�tied_param_names�tied_param_groups�tied_param_name� tied_param� param_namer�s r+�find_tied_parametersr-s>��>k�j�5�;Q�;Q�ch�;Q�;i�;i�j�j�j��%s�$r�E�DZ�DZ�lp�DZ�Dq�Dq�$r�$r�$r�!��/�4�4�6�6�7�7�#�>[�>`�>`�>b�>b�:c�:c�c��+�F�F��)�/�:� �!>�!D�!D�!F�!F� F� F��J�� "�"��%6�6�6�46�%�j�1�!�*�-�4�4�_�E�E�E�� F�$�$v�$v�\m�\s�\s�\u�\u�$v�$v�$v�w�w�wr-c��|D]�}d}|D]o}|}|�d��}|dd�D]}t||��}�t||d��}|�!|jtjd��kr|}n�p|�P|D]M}|}|�d��}|dd�D]}t||��}�t ||d|��N��dS)aX Reties tied parameters in a given model if the link was broken (for instance when adding hooks). Args: model (`torch.nn.Module`): The model in which to retie parameters. tied_params (`List[List[str]]`): A mapping parameter name to tied parameter name as obtained by `find_tied_parameters`. NrQr:r|)r�r�r4r3�setattr) r*r�� tied_group�param_to_tierrvr�r�r�s r+�retie_parametersrds��"�:�:� ��$� � �J��F��%�%�c�*�*�F�� 0� 0�� /�/��F�F�2�J�/�/�E��#��V�8L�8L�(L�(L�$��#�(� :� :� ��#�)�)�#�.�.��#�C�R�C�[�4�4�E�$�V�U�3�3�F�F��r� �L�9�9�9�9��%:�:r-c��t|t��r+|�dd��}tt|��}|S)z4 Just does torch.dtype(dtype) if necessary. �torch.�)r)rY�replacer�r3�rNs r+�_get_proper_dtyper�s=��%��&�� h��+�+��u�%�%��Lr-�special_dtypes�buffers_onlyc��|�t|��}t|��}|�<d�|��D��}d�|��D��}tt��}g}|st|d��}n|�d��}|D�]@\}} |�"||vr| ��||z} n�|�*| ��t| j��z} n�t| j�� d��r*| ��t| j��z} n7| ��t|t| j��z} |�d��}tt|��dz��D]-}|d�|d|��xx| z cc<�.��B|S) z> Compute the size of each submodule of a given model. Nc�4�i|]\}}|t|��Sr�)r�r��key�dtyps r+rz(compute_module_sizes.<locals>.<dictcomp>�s'��_�_�_�9�3��#�0��6�6�_�_�_r-c�4�i|]\}}|t|��Sr�)r]r#s r+rz(compute_module_sizes.<locals>.<dictcomp>�s&��b�b�b�i�c�4�s�O�D�$9�$9�b�b�br-Tr�r~rQr )rr]r rrEr�r��numelrNrYr��minr��ranger�r�) r*rNrr � dtype_size�special_dtypes_size�module_sizes�module_listr�r^r8� name_parts�idxs r+�compute_module_sizesr0�s�� !�%�(�(��$�U�+�+� ��!�_�_��H\�H\�H^�H^�_�_�_��b�b�>�K_�K_�Ka�Ka�b�b�b��s�#�#�L��K��8�*�5�$�?�?�?��)�)�$�)�7�7��#� =� =��f��%�$�.�*@�*@��<�<�>�>�$7��$=�=�D�D� �]��<�<�>�>�O�F�L�$A�$A�A�D�D� �� )� )�*S� T� T� S��<�<�>�>�O�F�L�$A�$A�A�D�D��<�<�>�>�C� �O�F�L�4Q�4Q�$R�$R�R�D��Z�Z��_�_� ��Z��1�,�-�-� =� =�C��*�T�c�T�"2�3�3�4�4�4��<�4�4�4�4� =��r-c�T�t|||d��}|�dd��S)zO Compute the total size of buffers in each submodule of a given model. T)rNrr rr)r0�get)r*rNrr,s r+� compute_module_total_buffer_sizer3�s1��(��U�>�hl�m�m�m�L��B��"�"�"r-r�r,�no_split_module_classesc�� d}g}|��}t|��dkr�|�d��\� }t|tjj��r!t|��ng}t|��dks|j j |vr0|� }||kr|}� g}n-||kr|�� n� fd�|D��|z}t|��dk��||fS)aO Utility function that will scan a list of named modules and return the maximum size used by one full layer. The definition of a layer being: - a module with no direct children (just parameters and buffers) - a module whose class name is in the list `no_split_module_classes` Args: modules (`List[Tuple[str, torch.nn.Module]]`): The list of named modules where we want to determine the maximum layer size. module_sizes (`Dict[str, int]`): A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`). no_split_module_classes (`List[str]`): A list of class names for layers we don't want to be split. Returns: `Tuple[int, List[str]]`: The maximum size of a layer with the list of layer names realizing that maximum size. rc�(��g|]\}}��d|��|f��S�rQr�)r��n�v�module_names �r+r�z&get_max_layer_size.<locals>.<listcomp>�s0��W�W�W�d�a��K�!5�!5�!�!5�!5�q� 9�W�W�Wr-)�copyr��popr)r3r��Moduler�named_childrenr�r�r) r�r,r4�max_size�layer_names�modules_to_treatrv�modules_childrenr8r:s @r+�get_max_layer_sizerC�s&��(�H��K��|�|�~�~�� !� #� #�.�2�2�1�5�5��V�<F�v�u�x��<_�<_�g�4�� 5� 5� 7� 7�8�8�8�eg�� A�%�%��)9�)B�F]�)]�)]��,�D��h��*�m��!�!��"�"�;�/�/�/��W�W�W�W�FV�W�W�W�Zj�j�� !� #� #��[� � r-� max_memoryc��ddl}��Di�t��r�ttj��D]�} tjdtjd|��}tj�|��d�|<�U#t$r!t�d|�d��Y�wxYw�n;t��r�ttj ��D]�} tjdtjd|��}tj �|��d�|<�U#t$r!t�d|�d��Y�wxYw�n}t��r�ttj��D]�} tjdtjd|��}tj�|��d�|<�U#t$r!t�d|�d��Y�wxYw�n�t!��r�ttj��D]�} tjdtjd |��}tj�|��d�|<�U#t$r!t�d|�d��Y�wxYw�nt%��r�ttj��D]l} tjdtjd |��}t)|��|<�?#t$r!t�d|�d��Y�iwxYw�nYt+��r�ttj��D]�} tjdtjd|��}tj�|��d�|<�U#t$r!t�d|�d��Y�wxYwn�ttj��D]p} tjdg|��}tj�|��d�|<�C#t$r!t�d|�d��Y�mwxYwt1��r|��j�d<n|��j�d <�S�D]5}t7�|t8��rt;�|��|<�6d��D��}|��t��rtj��}n�t��rtj ��}n�t��rtj��}n�t!��rtj��}nxt%��rtj��}nKt+��rtj��}ntj��}|D]H}||ks|dkr:t� d|�dtCt|��I|�fd�dD��z}��D]} | |vrtEd| �d��fd�|D��S)zb Get the maximum memory available if nothing is passed, converts string to int otherwise. rN�npur�zDevice z; seems unavailable, Proceeding to check subsequent devices.�mlu�sdaa�musar�r��mpsr/c�<�g|]}t|t��|��Sr��r)rE)r��ks r+r�z"get_max_memory.<locals>.<listcomp>9s'��F�F�F��:�a��3E�3E�F�1�F�F�Fr-z) is not available, available devices are c�@��g|]}|��v�|��Sr�)r �r�rMrDs �r+r�z"get_max_memory.<locals>.<listcomp>Ns-�� ]� ]� ]�q�a�:�?�?�K\�K\�F\�F\��F\�F\�F\r-)rJr/�diskzX is not recognized, available devices are integers(for GPU/XPU), 'mps', 'cpu' and 'disk'c�"��i|]}|�|��Sr�r�rOs �r+rz"get_max_memory.<locals>.<dictcomp>Us��8�8�8�q�!�Z��]�8�8�8r-)#�psutilrr)r3rF�device_countr^r4�mem_get_infornr��inforrGrrHrrIrr�rrr�rr�virtual_memory� availabler)rYrMr �sort�warningrrI) rDrR�ir�r$�gpu_devices�num_devicesr4�all_devicesrMs ` r+�get_max_memoryr^�s��M�M�M�� 7 ��5�9�1�1�3�3�4�4� � ��Q�u�|�E�1�/E�/E�F�F�F�A�$)�I�$:�$:�1�$=�$=�a�$@�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� / ��5�9�1�1�3�3�4�4� � ��Q�u�|�E�1�/E�/E�F�F�F�A�$)�I�$:�$:�1�$=�$=�a�$@�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� ' ��5�:�2�2�4�4�5�5� � ��Q�u�|�F�A�/F�/F�G�G�G�A�$)�J�$;�$;�A�$>�$>�q�$A�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� 5�:�2�2�4�4�5�5� � ��Q�u�|�F�A�/F�/F�G�G�G�A�$)�J�$;�$;�A�$>�$>�q�$A�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� 5�9�1�1�3�3�4�4� � ��Q�u�|�E�1�/E�/E�F�F�F�A�$<�Q�$?�$?�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� 5�9�1�1�3�3�4�4� � ��Q�u�|�E�1�/E�/E�F�F�F�A�$)�I�$:�$:�1�$=�$=�a�$@�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� 5�:�2�2�4�4�5�5� � ��a�S��3�3�3�A�$)�J�$;�$;�A�$>�$>�q�$A�J�q�M�M�� K�K� h�!� h� h� h�i�i�i��H�� B� &� 5� 5� 7� 7� A�J�u�� &� 5� 5� 7� 7� A�J�u��H�H��j��o�s�+�+� H�6�z�#��G�G�J�s�O��G�F�j�o�o�/�/�F�F�F�K�� 0��i�,�,�.�.�� 0��i�,�,�.�.�� 0��j�-�-�/�/�� 0��j�-�-�/�/�� 0��i�,�,�.�.�� 0��i�,�,�.�.��j�-�-�/�/��r�r��[� � �F�Q�J�J��N�N�p�V�p�p�VZ�[`�al�[m�[m�Vn�Vn�p�p�q�q�q�� ]� ]� ]� ]�,B� ]� ]� ]�]�K� �_�_� � ��K��u�!�u�u�u�� 9�8�8�8�K�8�8�8�J��s��AB�(C�C�AE�(F�F�AH�(I�?I�AK�(K>�=K>�?;M;�;(N&�%N&�'AP9�9(Q$�#Q$�?S�(T�Trr�r:c�"��dkrdn��d��fd�|��D��}tt|��dkr2t|��dkr�fd�|D��D]}||=�|d|�<��fd�|��D��}t��dkr%t��d��dznd�t�fd�|D��}|D]}t||� ��|S) z] Cleans a device_map by grouping all submodules that go on the same device together. rrQc�D��g|]\}}|��|��Sr��r�)r�rMr9�prefixs �r+r�z$clean_device_map.<locals>.<listcomp>`s.�� G� G� G�D�A�q�!�,�,�v�2F�2F� G�a� G� G� Gr-r c�>��g|]}|��|��Sr�ra)r�rMrbs �r+r�z$clean_device_map.<locals>.<listcomp>bs*��@�@�@��1�<�<��+?�+?�@�!�@�@�@r-rc�~��g|]9}|��t|��t��k�7|��:Sr�)r�r�)r�rMr:rbs ��r+r�z$clean_device_map.<locals>.<listcomp>gsH��k�k�k�a��V�8L�8L�k�QT�UV�QW�QW�Z]�^i�Zj�Zj�Qj�Qj��Qj�Qj�Qjr-c3�x�K�|]4}d�|�d��d��V��5dS�rQN)r�r�)r�rMr/s �r+r�z#clean_device_map.<locals>.<genexpr>isB��R�R�A�3�8�8�A�G�G�C�L�L��#��$6�7�7�R�R�R�R�R�Rr-)r:)r r�r�r r��clean_device_map)r�r:r�rM�children_modules�childr/rbs ` @@r+rgrgZsY�� "�$�$�R�R�[�*;�*;�*;�F� G� G� G� G�J�,�,�.�.� G� G� G�F� �3�v�;�;��1��V��q��@�@�@�@�Z�@�@�@� � �A��1� � �"(��)� �;��l�k�k�k�k�:�?�?�#4�#4�k�k�k��-0��-=�-=��-A�-A�#�k��$�$� %� %�� )� )�q�C��R�R�R�R�AQ�R�R�R�R�R��!�8�8��7�7�7�7�7��r-c��|�t|��dkrdS|��D]�\}}d|vr� d}d|vrw|�dd��|��vrM|�dd��}t t j�||�d��||��}t j�||�d��}t ||��}t||d||��dS)a� Loads the weights from the offload folder into the model. Args: model (`torch.nn.Module`): The model to load the weights into. index (`dict`): A dictionary containing the parameter name and its metadata for each parameter that was offloaded from the model. offload_folder (`str`): The folder where the offloaded weights are stored. Nrr�r�z.datr/)rxry) r�r rr r�os�pathr�r�) r*r1�offload_folderr�metadatary�weight_name�tensor_filer�s r+�load_offloaded_weightsrqps�� }��E� � �a�� %�� m�m�� H��J��z�!�!�j�&8�&8��5�&I�&I�U�Z�Z�\�\�&Y�&Y�$�,�,�X�u�=�=�K�3��^��-A�-A�-A�B�B�E�+�DV��O��g�l�l�>�j�3F�3F�3F�G�G��&�{�H�=�=��#�E�:�u�F�\k�l�l�l�l�l�m�mr-c��i�|D]E}|dksd|vr� |�dd��d}��|d��dz�|<�F�fd�|D��}|S)NrrQr rc�T��g|]$}��|d��dk�|dk�"|��%S)rr)r2)r�rv�module_childrens �r+r�z%get_module_leaves.<locals>.<listcomp>�s@�� h� h� h��?�3F�3F�v�q�3Q�3Q�UV�3V�3V�[a�eg�[g�[g�f�[g�[g�[gr-)�rsplitr2)r,rv�parent�leavesrts @r+�get_module_leavesrx�s��O��E�E��R�<�<�3�f�,�,��s�A�&�&�q�)��"1�"5�"5�f�a�"@�"@�1�"D�� h� h� h� h�<� h� h� h�F��Mr-�low_zeroc�z��du}t��t��rd�nht��rd�nWt��rd�nFt ��rd�n5t��rd�n$t ��rd�nt��rd�nd �t��fd ��D��}|dkr�S|dkr`d }|r\�� D]G}t|t��r0�|xxdzcc<t� d|�d��n�Ht|||��d|r|dz n|z} |�g}nt|tt f��s|g}t|��dkr�i} ��D]�\}}|dkr�|} |�d��D]}t'| |��} �| jj}||vr || vr|| |<t-| � ��t-|��krn��t| ��dkr!t/| ��nd}nd}t3��fd��D��t3��tt5�fd��D��t/t��d��z��}tdt/||��z��}| |z } tt7d��D��}|dd�D]+}t9|r|dkr�dn| �|��|<�,|rYt/d�dt5�fd�t;d|��D��z ��}t9|�d��d<�S)a� Compute a `max_memory` dictionary for [`infer_auto_device_map`] that will balance the use of each available GPU. <Tip> All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the meta device (as it would if initialized within the `init_empty_weights` context manager). </Tip> Args: model (`torch.nn.Module`): The model to analyze. max_memory (`Dict`, *optional*): A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset. Example: `max_memory={0: "1GB"}`. no_split_module_classes (`List[str]`, *optional*): A list of layer class names that should never be split across device (for instance any layer that has a residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): If provided, special dtypes to consider for some specific weights (will override dtype used as default for all weights). low_zero (`bool`, *optional*): Minimizes the number of weights on GPU 0, which is convenient when it's used for other operations (like the Transformers generate function). NrFrGrHrIr�r�rJrc�f��g|]-}tj|��j�k��|dk�+|��.S�r)r3r4r2)r��d�expected_device_typerDs ��r+r�z'get_balanced_memory.<locals>.<listcomp>�sA��s�s�s�Q��Q��0D�H\�0\�0\�ak�lm�an�qr�ar�ar�q�ar�ar�arr-rr Fg��?z(We will use 90% of the memory on device z� for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).�rNrrrQc�$��i|]\}}|�v� ||�� Sr�r�)r�r8r9rws �r+rz'get_balanced_memory.<locals>.<dictcomp>s$��M�M�M�T�Q��Q�f�_�_�A�q�_�_�_r-c� ��g|] }�|��Sr�r�)r�r8r,s �r+r�z'get_balanced_memory.<locals>.<listcomp> s��;�;�;�q�<��?�;�;�;r-g�?c3�VK�|]$\}}t|t��r |dk� |V��%dS)rNrL)r�� device_id� device_mems r+r�z&get_balanced_memory.<locals>.<genexpr>sN�� /�)�Z��T]�_b�Ic�Ic� �hr�uv�hv�hv�I�hv�hv�hv�hv� � r-r:c� ��g|] }�|��Sr�r�)r�rZrDs �r+r�z'get_balanced_memory.<locals>.<listcomp>s��1_�1_�1_�A�*�Q�-�1_�1_�1_r-)r^rrrrrrrr�r r)rEr�rUr0r�tupler r�r�r�r�r��maxr�rxr�rr(r))r*rDr4rNrry�user_not_set_max_memoryr\r$�per_gpu�no_split_childrenr�r8� submodule�submodule_name� class_name�buffer�mean_leaves� gpus_idx_listr/�min_zeror~rwr,s ` @@@r+�get_balanced_memoryr��s��J)�D�0�� +�+�J��&�$�� &�$�� &�%�� &�%�� &�$�� &�$�� &�$��%��s�s�s�s�s�*�s�s�s�t�t�K��a��a��"� �!��(�(� � ��c�3�'�'��s�O�O�O�s�*�O�O�O��K�K�o�3�o�o�o��E� �(��U�>�Z�Z�Z�L��2��h�#O�;��?�?�K�P�G��&�"$�� /�$�� ?� ?�<�#:�";��"�#�#�a�'�'��&�,�,�.�.� � �J�D�$��r�z�z��I�"&�*�*�S�/�/� ?� ?��#�I�~�>�>� � �"�,�5�J��4�4�4��K\�9\�9\�04�!�*�-��$�)�)�+�+�,�,��4K�0L�0L�L�L��M�47�8I�4J�4J�Q�4N�4N��&�-�-�/�/�0�0�0�TU��|� ,� ,�F�M�M�M�M�\�%7�%7�%9�%9�M�M�M�L� �|� ,� ,�F��c�;�;�;�;�F�;�;�;�<�<�s�3�v�;�;�PQ�?R�?R�R�S�S�K� ��F�K�0�0�0� 1� 1�F��v��G�� 3=�3C�3C�3E�3E� � � � � ��M��S�b�S�!�d�d��x�Q�C�1�H�H�j��m�m�'�S]�^a�Sb�c�c� �3��5��q�,�r�*�S�1_�1_�1_�1_��q�R]�I^�I^�1_�1_�1_�-`�-`�`�a�a��H�j��m�4�4� �1� ��r-c�V�t|��}t|dd��}|�g}t|�d��t|��zt|�d��z}t |||��}|d}||fS)z:Computes the total size of the model and its largest layer�_no_split_modulesNFr�r)r0r�rr�r>r�rC)r*�sizes�no_split_modulesrA� largest_layer� total_sizes r+�calculate_maximum_sizesr�s�� '�'�E��u�&9�4�@�@�� U� #� #�E� #� 2� 2�3�3� �u�#�#�%�%� &� &� '� �u�"�"�5�"�1�1� 2� 2� 3�� '�'7��@P�Q�Q�M��r��J��}�$�$r-c��t|��}|�g}nt|ttf��s|g}t|��}d|vr|�d��d�|D��}d|vrdg}n!t |��dkr|ddg}ndg}t|||��}t|��} t|��r-t | ��dkrt�d��t|�d � ��t|� ��zt|�d � ��z} |||||| || fS)zZ Initialize variables required for computing the device map for model allocation. NrPc��g|]}|dv�|�� S))r/rPr��r�r4s r+r�z/_init_infer_auto_device_map.<locals>.<listcomp>Ls"��J�J�J�v�F�/�,I�,I�F�,I�,I�,Ir-rJrr/r�rThe model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.Fr�)r^r)rr�r rr�r0rr�r�r�r�r>r�)r*rDr4rNr�devices�gpus�main_devicesr,�tied_parametersrAs r+�_init_infer_auto_device_mapr�0s��& � �+�+�J��&�"$�� /�$�� ?� ?�<�#:�";��:�?�?�$�$�%�%�G� �W��v��J�J��J�J�J�D� ��}�}��w�� T��Q��Q��'��w��'��U�>�Z�Z�Z�L�*�5�1�1�O�&�u�-�-� �#�o�2F�2F�!�2K�2K�� A� � � � �U� #� #�E� #� 2� 2�3�3� �u�#�#�%�%� &� &� '� �u�"�"�5�"�1�1� 2� 2� 3�� r-c�n� �t|��dkr|ggfSg}g}|D]e� � fd�t|��D��d}|�||d��|�||d��f|}t||��D]\� }||||� z z }�|||fS)a: Calculate the total size of a module, including its tied parameters. Args: tied_params (`List[str]`): The list of tied parameters. module_size (`int`): The size of the module without tied parameters. module_sizes (`Dict[str, int]`): A dictionary mapping each layer name to its size. modules_to_treat (`List[Tuple[str, nn.Module]]`): The list of named modules to treat. Returns: `Tuple[int, List[str], List[nn.Module]]`: The total size of the module, the names of the tied modules, and the tied modules. r c�P��g|]"\}\}}��|dz�� |��#Sr7ra)r�rZr8r�rs �r+r�z-get_module_size_with_ties.<locals>.<listcomp>�s;��n�n�n�9�1�f�q�!�z�Od�Od�ef�il�el�Om�Om�n�Q�n�n�nr-r)r�� enumerater�zip) r��module_sizer,rA�tied_module_names�tied_modules�tied_module_index�module_size_with_ties�tied_module_namers @r+�get_module_size_with_tiesr�qs��&�;��!��B��"�"��L�!�D�D� �n�n�n�n� �:J�0K�0K�n�n�n�op�q�� !1�2C�!D�Q�!G�H�H�H��,�->�?��B�C�C�C�C�'��(+�K�9J�(K�(K�[�[�$� �$��.>�!?�,�z�BZ�!Z�Z�� "3�\�A�Ar-� size_limitr�c�� t|��}n#t$rdd|fcYSwxYw|�g}|�g}|��}d}|�r|�d��\�}�fd�|D��}t �fd�|D��g��} t| |�||��\} }}| |krd}n�t |tj��st |tj ��rgn t|��}t|��dks|jj|vr��t|�d��|z}�fd�|D��|z}|��|sdd|fSd �|D��} d �t#��D��}|D]�}�d|��| vr�| ��}||\}}t|�d��t|��z}|d|��fd�|D��z||dzd�z}d �|D��} ��| ��}|�|��\�}�||fS)a Find a module that fits in the size limit using BFS and return it with its name and the remaining modules. Args: modules (`List[Tuple[str, nn.Module]]`): The list of named modules to search in. module_sizes (`Dict[str, int]`): A dictionary mapping each layer name to its size (as generated by `compute_module_sizes`). size_limit (`Union[int, str]`): The maximum size a module can have. no_split_module_classes (`Optional[List[str]]`, *optional*): A list of class names for layers we don't want to be split. tied_parameters (`Optional[List[List[str]]`, *optional*): A list of lists of parameter names being all tied together. Returns: `Tuple[Optional[str], Optional[nn.Module], List[Tuple[str, nn.Module]]]`: A tuple containing: - The name of the module that fits within the size limit. - The module itself. - The list of remaining modules after the found module is removed. NFrc��g|]:}t�fd�|D��t�fd�|D��8|��;S)c3�,�K�|]}�dz|dzvV��dSrfr��r�rMr�s �r+r�z/fallback_allocate.<locals>.<listcomp>.<genexpr>��0��=�=�Q�4�#�:��S��(�=�=�=�=�=�=r-c3�,�K�|]}�dz|dzvV��dSrfr�r�s �r+r�z/fallback_allocate.<locals>.<listcomp>.<genexpr>��5��It�It�de�$�QT�*�XY�\_�X_�J_�It�It�It�It�It�Itr-�r��all�r�rr�s �r+r�z%fallback_allocate.<locals>.<listcomp>��v�� =�=�=�=�*�=�=�=�=�=� �GJ�It�It�It�It�is�It�It�It�Ft�Ft� �� r-c�,��g|]}�fd�|D��S)c�(��g|]}�dz|dzv�|��Sr7r��r��pr�s �r+r�z0fallback_allocate.<locals>.<listcomp>.<listcomp>��+�� A� A� A�A�t�c�z��S��'@�'@�a�'@�'@�'@r-r�r�s �r+r�z%fallback_allocate.<locals>.<listcomp>��.��f�f�f�j� A� A� A� A�� A� A� A�f�f�fr-Tr�c�(��g|]\}}��d|��|f��Sr7r��r�r8r9r�s �r+r�z%fallback_allocate.<locals>.<listcomp>�s,��M�M�M�D�A�q��m�m��m�m�Q�/�M�M�Mr-c��g|]\}}|��Sr�r��r�r8r�s r+r�z%fallback_allocate.<locals>.<listcomp>�s��+�+�+�4�1�a�Q�+�+�+r-c�$�g|] \}}|dk�|��Sr7r�)r�rZr�s r+r�z%fallback_allocate.<locals>.<listcomp>�s!��9�9�9�T�Q��S��q��r-c�(��g|]\}}��d|��|f��Sr7r�)r�r8r9�parent_names �r+r�z%fallback_allocate.<locals>.<listcomp>�s0��I�I�I��A�{�(�(�Q�(�(�!�,�I�I�Ir-r c��g|]\}}|��Sr�r�r�s r+r�z%fallback_allocate.<locals>.<listcomp>�s��3�3�3�4�1�a�Q�3�3�3r-)rMrIr;r<r�r�r)r�r�r3r�rr>r�r�r�r�r�r1)r�r,r�r4r��modules_to_search�module_foundrvrr�r�r�rB� current_names�dot_idx� dot_index�parent_module_idx� parent_modulert� target_idxr�r�s @@r+�fallback_allocater��sY��8#�-�j�9�9� � ��#�#�#��T�7�"�"�"�"�#��&�"$��L� �#b�(�,�,�Q�/�/��f� � � � �-� � � ��f�f�f�f�Te�f�f�f�hj� � ��'@��d�+�\�;L�' �' �#��q�!� !�J�.�.��L�� &�"�,�/�/� /�3=�f�e�l�3S�3S� /�B�B��f�+�+�-�-�.�.� �� A�%�%��)9�)B�F]�)]�)]�� 7� 7�� 7� F� F�G�G�JZ�Z��M�M�M�M�<L�M�M�M�Pa�a��G�#b�J�#��T�7�"�"�,�+�7�+�+�+�M�9�9�Y�t�_�_�9�9�9�G�� 4� 4� ��:�I�:�&��-�'�'� -� 3� 3�K� @� @��&�'8�9��A�}�"�=�#A�#A�%�#A�#P�#P�Q�Q�TX��,�,�.�.�U�U��O��*�*�*�+�I�I�I�I��I�I�I�J��+�a�/�1�1�2�3� � 4�3�7�3�3�3�M��$�$�T�*�*�J��;�;�z�*�*�L�D�&�� s��&�&�verbose�clean_result�offload_buffers�fallback_allocationc �,�0�1�t|||||��\} }} }}} }}t��}d}d�| D��}i}i}t|||��\}}t|��dk�r|�d��\�0}|rtd�0�d��0fd�|D��}t|��dkrtd�|D��||��\}}|�0}�0fd�| D��}|r%t|��dkrtd|��t �0fd �|D��g��}|r%t|��dkrtd |��| |}|dkr||nd}d}| || vr||z }|}t||||��\}}}|�|||z|kr�|rEd �0��} |r | d|��z } n | d|�d�z } |�| d|||z �d�z } | d|�d�z } t| ��||xx|z cc<||�0<|D]R�1�1d�|D��vr=t�1fd�t|��D��}!|�|!��||�1<�S|sHt|tj��r.t|||��}"|�|d��|"z||<��Ot|��dk�r[|||z|k�rK|r.td| |�d�0�d|�d|||z �d|�d��d}#t||��D]�\�1}$t!|$��}%t|%��dks|$jj|vr�H|rtd�1�d��t!|$�d��|%z}%�1fd�|%D��}%�1fd�t|��D��d}!�0|fg|d|!�z|%z||!d zd�z}td!�|D��||��\}}d"}#|#r��|rtd#��|||z|k�r#t|tj��st|t,j��rgn t!|��}&|r+td| |�d�0�d|||z �d$|�d� ��t|&��dks|jj|vr|rtd%��nl|rtd�0�d��t!|�d��|&z}&�0fd&�|&D��|z}td'�|D��||��\}}��||dkrS|rQ|dkrK||t1||��z }t3|||||z || ��\}'}(})|(�|'|(fg�0|fgz|)z}��P||dkr||z||<|||z||<|d z }�0|fg|z}t|��dk��d(�|��D��}|rt7|��}|�d)d��|�dd��z}*|*dkr{|syd}+|��D]5\},}-|,d)ks|,dkr�|+s!|�|,d��}.|-|*|.zkrd"}+�6t|��dkr|+st9jd*|*�d+��|rOd,�d-�|��D��}/t>� d.|/�d/��|S)0aQ Compute a device map for a given model giving priority to GPUs, then offload on CPU and finally offload to disk, such that: - we don't exceed the memory available of any of the GPU. - if offload to the CPU is needed, there is always room left on GPU 0 to put back the layer offloaded on CPU that has the largest size. - if offload to the CPU is needed,we don't exceed the RAM available on the CPU. - if offload to the disk is needed, there is always room left on the CPU to put back the layer offloaded on disk that has the largest size. <Tip> All computation is done analyzing sizes and dtypes of the model parameters. As a result, the model can be on the meta device (as it would if initialized within the `init_empty_weights` context manager). </Tip> Args: model (`torch.nn.Module`): The model to analyze. max_memory (`Dict`, *optional*): A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset. Example: `max_memory={0: "1GB"}`. no_split_module_classes (`List[str]`, *optional*): A list of layer class names that should never be split across device (for instance any layer that has a residual connection). dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. special_dtypes (`Dict[str, Union[str, torch.device]]`, *optional*): If provided, special dtypes to consider for some specific weights (will override dtype used as default for all weights). verbose (`bool`, *optional*, defaults to `False`): Whether or not to provide debugging statements as the function builds the device_map. clean_result (`bool`, *optional*, defaults to `True`): Clean the resulting device_map by grouping all submodules that go on the same device together. offload_buffers (`bool`, *optional*, defaults to `False`): In the layers that are offloaded on the CPU or the hard drive, whether or not to offload the buffers as well as the parameters. fallback_allocation (`bool`, *optional*, defaults to `False`): When regular allocation fails, try to allocate a module that fits in the size limit using BFS. rc��i|]}|d��Sr|r�r�s r+rz)infer_auto_device_map.<locals>.<dictcomp>Ds��:�:�:��&�!�:�:�:r-z Treating module rQc�P��g|]"}|�k�|��dz�� |��#Sr7ra)r�r8r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>Qs5��d�d�d��d��1�<�<�X\�_b�Xb�Kc�Kc��1��r-c�Z�g|](\}}t|tjj��$||f��)Sr��r)r3r�r=�r�r8r�s r+r�z)infer_auto_device_map.<locals>.<listcomp>Ts3��W�W�W�D�A�q� �1�e�h�o�8V�8V�W�!�Q��W�W�Wr-c��g|]:}t�fd�|D��t�fd�|D��8|��;S)c3�,�K�|]}�dz|dzvV��dSrfr�r�s �r+r�z3infer_auto_device_map.<locals>.<listcomp>.<genexpr>cr�r-c3�,�K�|]}�dz|dzvV��dSrfr�r�s �r+r�z3infer_auto_device_map.<locals>.<listcomp>.<genexpr>cr�r-r�r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>`r�r-z' Found the relevant tied param groups c�,��g|]}�fd�|D��S)c�(��g|]}�dz|dzv�|��Sr7r�r�s �r+r�z4infer_auto_device_map.<locals>.<listcomp>.<listcomp>kr�r-r�r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>kr�r-z4 So those parameters need to be taken into account rPNzPutting z and z (size=�)z (available=z on c��g|] }|d��Sr|r�)r�r�s r+r�z)infer_auto_device_map.<locals>.<listcomp>�s��'G�'G�'G��!��'G�'G�'Gr-c3�4�K�|]\}\}}|�k�|V��dSr�r��r�rZr8r�r�s �r+r�z(infer_auto_device_map.<locals>.<genexpr>�s7��,u�,u�9�1�f�q�!�_`�dt�_t�_t�Q�_t�_t�_t�_t�,u�,ur-rzNot enough space on z to put z (space available z, needed size z).Fz Splitting r�c�(��g|]\}}��d|��|f��Sr7r�)r�r8r9r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>�s1��'h�'h�'h�4�1�a�,<�)B�)B�q�)B�)B�A�(F�'h�'h�'hr-c�,��g|]\}\}}|�k�|��Sr�r�r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>�s-��$m�$m�$m�9�1�f�q�!�WX�\l�Wl�Wl�Q�Wl�Wl�Wlr-r c�Z�g|](\}}t|tjj��$||f��)Sr�r�r�s r+r�z)infer_auto_device_map.<locals>.<listcomp>��3��[�[�[��1�J�q�%�(�/�<Z�<Z�[�a��V�[�[�[r-Tz?None of the tied module can be split, going to the next device.z, module size z6This module cannot be split, going to the next device.c�(��g|]\}}��d|��|f��Sr7r�r�s �r+r�z)infer_auto_device_map.<locals>.<listcomp>�s,��#T�#T�#T�4�1�a��]�]�q�]�]�A�$6�#T�#T�#Tr-c�Z�g|](\}}t|tjj��$||f��)Sr�r�r�s r+r�z)infer_auto_device_map.<locals>.<listcomp>�r�r-c�&�i|]\}}|dk�||��Sr|r��r�r4�mems r+rz)infer_auto_device_map.<locals>.<dictcomp>s*��_�_�_�+�&�#�WZ�]^�W^�W^�&�#�W^�W^�W^r-r/zCurrent model requires z� bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.� c3�.K�|]\}}d|�d|�d�V��dS)z - �: z bytes requiredNr�r�s r+r�z(infer_auto_device_map.<locals>.<genexpr>(sK��! �! �6A�f�c�1�6�1�1�S�1�1�1�! �! �! �! �! �! r-z{Based on the current allocation process, no modules could be assigned to the following devices due to insufficient memory: z� These minimum requirements are specific to this allocation attempt and may vary. Consider increasing the available memory for these devices to at least the specified minimum, or adjusting the model config.)!r�rrCr�r<�printr�r��nextr�r)r�r=r3r2r�rr>r�r�r�r�r3r�r�r�r rgr�r�r�r�rU)2r*rDr4rNrr�r�r�r�r�r�r�r,r�rAr��current_device�device_memory_used�device_buffer_sizes� device_minimum_assignment_memory�max_layer_size�max_layer_namesrvr�rr�r4�current_max_size�current_memory_reservedr�r�r��outputr��current_buffer_size�split_happened�tied_module�tied_module_childrenrB�fallback_module_name�fallback_module�remaining_modules�non_gpu_buffer_size�is_buffer_fit_any_gpu� gpu_device�gpu_max_memory�gpu_memory_used�devices_infor�r�s2 @@r+�infer_auto_device_maprs� ��~ $�E�:�7N�PU�We�f�f� ��J��N�:�:�'�:�:�:��')�$�'9�9I�<�Yp�&q�&q�#�N�O�� !� #� #�'�+�+�A�.�.��f�� 0��.�t�.�.�.�/�/�/�d�d�d�d�o�d�d�d��1�$�$�.@�W�W�$4�W�W�W��'�/�/�+�N�O�#�4�(�� -� � � �� Q�s�,�-�-��1�1��O�<M�O�O�P�P�P��f�f�f�f�Te�f�f�f�hj� � �� X�s�;�'�'�!�+�+��V��V�V�W�W�W��(��17�6�1A�1A�:�f�-�-�t��"#��>�"�l�2�2�/�.�@��&4�#�AZ��l�4D�B �B �>��0�,� �#�'9�&�'A�DY�'Y�]m�'m�'m�� *�D�*�*��$�7��9�&7�9�9�9�F�F��6��6�6�6�6�F�#�/��]�-=�@R�SY�@Z�-Z�]�]�]�]�F��*��*�*�*�*��f� � � ��v�&�&�&�*?�?�&�&�&� &�J�t��%6� 6� 6� �#�'G�'G�6F�'G�'G�'G�G�G�(,�,u�,u�,u�,u� �JZ�@[�@[�,u�,u�,u�(u�(u�%�$�(�(�):�;�;�;�06� �+�,�,�#� g�z�&�"�)�'D�'D� g�&F��%��'�'�'�#�/B�.E�.E�f�a�.P�.P�Sf�.f�#�F�+��{��a��$6�v�$>��$L�P`�$`�$`�� x�7�>�+B�x�x�D�x�x�Wh�x�x�!1�4F�v�4N�!N�x�x�^s�x�x�x��#�N�14�5F��1U�1U� � �-� �+�'+�K�,F�,F�,H�,H�'I�'I�$��+�,�,��1�1�[�5J�5S�Wn�5n�5n��<��:�'7�:�:�:�;�;�;�'+�K�,H�,H�QV�,H�,W�,W�'X�'X�[o�'o�$�'h�'h�'h�'h�Sg�'h�'h�'h�$�$m�$m�$m�$m� �BR�8S�8S�$m�$m�$m�no�$p�!��F�^�$�&�'9�(9�'9�:�;�*�+�'�'8�1�'<�'>�'>�?�@�!�3E�[�[�(8�[�[�[� �+�3�3�/�� "&�� Y��W�X�X�X��f�%��3�7G�G�G��f�b�l�3�3�3�7A�&�%�,�7W�7W�3��&�/�/�1�1�2�2� � � ��d�7�>�+B�d�d�D�d�d�'�*<�V�*D�D�d�d�T_�d�d�d��#�$�$��)�)�V�-=�-F�Ja�-a�-a��T��R�S�S�S��0��.�t�.�.�.�/�/�/�#'��(?�(?��(?�(N�(N�#O�#O�Rb�#b� �#T�#T�#T�#T�CS�#T�#T�#T�Wg�#g� �2D�[�[�(8�[�[�[� �+�3�3�/�� f�%��*�*�/B�*�v�QW�GW�GW� *�&�1�C��H]�4^�4^�^��GX� �� #5�f�#=�=�'��H�H�D� �/�3D��*�%9�?�$K�#L�QU�W]�P^�O_�#_�bs�#s� ��f�%��*�*�7L�Of�7f�,�V�4�&8��%?�BY�%Y��6�"��!��!�6�N�+�.>�>��A�� !� #� #�D`�_�9K�9Q�9Q�9S�9S�_�_�_��2�%�j�1�1� �-�1�1�%��;�;�>Q�>U�>U�V\�^_�>`�>`�`��Q�� %��*4�*:�*:�*<�*<� 1� 1�&�J��U�"�"�j�F�&:�&:��(� 1�"4�"8�"8��Q�"G�"G��!�%8�?�%J�J�J�,0�)��t�9�9�q�=�=�!6�=��M�)�*=�)�)�)� � � �(� ��y�y�! �! �Ee�Ek�Ek�Em�Em�! �! �! � � �� x�� x� x� x� � � ��r-c�^��d�|��D��}|��D],��dkr|��n�fd�|D��}�-t |��dkr'd�|��}t d|��dS)z� Checks a device map covers everything in a given model. Args: model (`torch.nn.Module`): The model to check the device map against. device_map (`Dict[str, Union[int, str, torch.device]]`): The device map to check. c��g|]\}}|��Sr�r��r�r�r�s r+r�z$check_device_map.<locals>.<listcomp>=s��H�H�H�'�$��H�H�Hr-rc�P��g|]"}|�k�|��dz�� |��#Sr7ra)r�r�r:s �r+r�z$check_device_map.<locals>.<listcomp>CsE��!�!�!��{�*�*�4�?�?�;�QT�CT�3U�3U�*��*�*�*r-rz, zOThe device_map provided does not give any device for the following parameters: N)� state_dictr r �clearr�r�rI)r*r��all_model_tensors�non_covered_paramsr:s @r+�check_device_mapr5s��I�H�U�-=�-=�-?�-?�-E�-E�-G�-G�H�H�H��!��(�(� � ��"��#�#�%�%�%��E�!�!�!�!�-�!�!�!�� !�!�!�Y�Y�'8�9�9��r�^p�r�r� � � �"�!r-c�P��|�d��rgt|d��5}|��}|��}ddd��n#1swxYwY|�"t�d|�d��ddi}|�d��dvrtd|�d ��|ddkrtd |d�d��|�t|��Stt|��dkrst|��d }|}t|t��r$t!��rd|��}nt#��rd}t||��Stt|��dhz ��}d|vr|�d��d�|D��|��D]0\�}||vr'�|��fd�|D��1�d��fd�|D��i}t+��r.t-dt/�fd�|D��dd d��} nd} |D]�}|}t|t��r$t!��rd|��}nt#��rd}t|d|��5}�|D]^} | �,| �|d��| �| ��|�| ��|| <| �| ��_ ddd��n#1swxYwY��| �| ��|St;j|t;jd��S)a3 Load a checkpoint from a given file. If the checkpoint is in the safetensors format and a device map is passed, the weights can be fast-loaded directly on the GPU. Args: checkpoint_file (`str`): The path to the checkpoint to load. device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*): A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the same device. z.safetensors�pt)� frameworkNz"The safetensors archive passed at zx does not contain metadata. Make sure to save your model with the `save_pretrained` method. Defaulting to 'pt' metadata.�format)r�tf�flaxzf does not contain the valid metadata. Make sure you save your model with the `save_pretrained` method.z%The checkpoint passed was saved with z, we need a the pt format.r rr�r�r�rPr/c��i|]}|g��Sr�r�r�s r+rz#load_state_dict.<locals>.<dictcomp>�s��?�?�?�V�f�b�?�?�?r-c�P��g|]"}|�ks|��dz�� |��#Sr7ra)r�rMr:s �r+r�z#load_state_dict.<locals>.<listcomp>�s;��h�h�h�q�A��4D�4D��U`�cf�Uf�Hg�Hg�4D��4D�4D�4Dr-c�\��g|](}|t��g��v�&|��)Sr�)r�r�)r�rM�device_weightss �r+r�z#load_state_dict.<locals>.<listcomp>�s;��)o�)o�)o��Q�c�R`�Rg�Rg�Ri�Ri�km�Nn�Nn�En�En�!�En�En�Enr-Fc�:��g|]}t�|��Sr�)r�)r�r4rs �r+r�z#load_state_dict.<locals>.<listcomp>�s&��Q�Q�Q�v�s�>�&�#9�:�:�Q�Q�Qr-�w)�main_process_only�total�unit� smoothing�leave)rr4)�dev�refresh)�map_location) rGr"rnr r�r�r2�OSErrorrI�safe_load_filer�r�r�rr)rErrrr �extendrrr��set_postfix�set_description� get_tensor�update�closer3�loadr4) �checkpoint_filer��frn�weight_namesr4� target_devicer��tensors�progress_barr$rr:s @@r+�load_state_dictr5Os��/�/�QM� ��$� 7� 7� 7� $�1��z�z�|�|�H��6�6�8�8�L� $� $� $� $� $� $� $� $� $� $� $�� $� $� $� $��K�K�o�_�o�o�o� � � �!�$�'�H��<�<��!�!�)=�=�=��I�_�I�I�I�� h� �4� '� '��s�X�h�EW�s�s�s�t�t�t��!�/�2�2�2��3�z�(�(�*�*�+�+�,�,��1�1��j�/�/�1�1�2�2�1�5�� &� ��f�c�*�*�.�'�)�)�.�(7�v�� )�+�+�.�(-� �%�o�m�L�L�L�L��3�z�0�0�2�2�3�3�v�h�>�?�?�G��G�#�#��u�%�%�%�@�?�w�?�?�?�N�'1�'7�'7�'9�'9� � �#��V��W�$�$�"�6�*�1�1�h�h�h�h�L�h�h�h�� 5�!�(�(�)o�)o�)o�)o�\�)o�)o�)o�p�p�p��G� �"�"� $�#�&+��Q�Q�Q�Q��Q�Q�Q�R�R�� $��!� 2� 2�� &� ��f�c�*�*�.�'�)�)�.�(7�v�� )�+�+�.�(-� ��$�}�U�U�U�2�YZ�-�f�5�2�2��'�3�(�4�4��4�O�O�O�(�8�8��=�=�=�'(�|�|�C�'8�'8��'�3�(�/�/�1�1�1�� 2�2�2�2�2�2�2�2�2�2�2�2��2�2�2�2��'��"�"�$�$�$��N��z�/��U�8K�8K�L�L�L�Ls$�)A�A#�&A#�'A(M�M �#M c��i}t��}|��D]�\}}|dkr� t|d��5|��}ddd��n#1swxYwYn#t$rt d��d�wxYw|D]g}||jt jd��kr|�|d|��z��A||}|�d��||d|��z<�h��|� ��D]}||vr|� |��|rt�d|��|S)z� Returns the state dictionary for an offloaded model via iterative onloading Args: model (`torch.nn.Module`): The offloaded model we want to save rr/Nz;Offloaded module must fit in CPU memory to call save_model!r|rQzMThe following tensors were not saved because they were still on meta device: ) r�r��align_module_devicer �MemoryErrorr4r3�addr�r;�remover�rY)r*r �placeholdersr�rv�module_state_dictr$�paramss r+�get_state_dict_offloaded_modelr>�s��J��5�5�L��+�+�-�-�<�<��f��2�:�:�� g�$�V�U�3�3� 8� 8�$*�$5�$5�$7�$7�!� 8� 8� 8� 8� 8� 8� 8� 8� 8� 8� 8�� 8� 8� 8� 8�� g� g� g��[�\�\�bf�f� g��%� <� <�C� ��%�,��V�0D�0D�D�D�� C� � �!1�2�2�2��&�s�+�F�+1�9�9�U�+;�+;�J�t�i�#�i�i�'�(�(� <�� "�"�%�%��*��$�$�$��w��u�gs�u�u�v�v�v��s/�A/�A#�A/�#A' �'A/�*A' �+A/�/B r/r �device_to_put_offloadc�6�|d|�d��}t|��sd}t||��5|��D]\}}|d|��z|vr|||d|��z<� ddd��n#1swxYwY|S)aL Retrieve the state dictionary (with parameters) from an offloaded module and load into a specified device (defaults to cpu). Args: module: (`torch.nn.Module`): The module we want to retrieve a state dictionary from module_name: (`str`): The name of the module of interest state_dict (`Dict[str, Union[int, str, torch.device]]`): Dictionary of {module names: parameters} device_to_put_offload (`Union[int, str, torch.device]`): Device to load offloaded parameters into, defaults to the cpu. NrQ)�rfind�has_offloaded_paramsr7r r )rvr:r r?�root�m_keyr=s r+�get_state_dict_from_offloadrE�s��*�/��*�*�3�/�/�/�0�D� ��'�'�%� $�� V�%:� ;� ;�8�8�#�.�.�0�0�6�6�8�8� 8� 8�M�E�6��{�5�{�{�"�z�1�1�17� �4�+�e�+�+�-�.�� 8�8�8�8�8�8�8�8�8�8�8�8��8�8�8�8� �s�AB�B�B� checkpointrm�offload_state_dict�keep_in_fp32_modules�offload_8bit_bnb�strictc ��#�|rddlm} t|��}t|��r-t |��dkrt �d��|�t||��|�'|�%d|��vrtd��|�.|�,d|��vrtj|d� ��t|t��r+|�d d��}tt |��}d}d} tj�|��r+t|��d��r|} �n�|g}�n�tj�|��rQd �tj|��D��}d�tj|��D��}t |��dkr(tj�||d��g}n�t |��dkr(tj�||d��g}n�d�tj|��D��}t |��dkr"t|�dt.�dt0�d��t |��dkr'tj�||d��} n%t|�d��td|�d��| ��tj�| ��d�#t5| ��5}t7j|��}ddd��n#1swxYwYd|vr|d}t=t?tA|��}�#fd�|D��}i}|rtCj"��}i}tA��}tA|�#��$��}d�|�%��D��}|D�]�}tM||��}|�P|�&|| ��|�'tA|�$��|z ��n,|�(��D�]\}}d|vr�||vr|�)|��| s�'|}t |��dkrK||vrGd�|�d��dd��}t |��dkr||v�G|dkrd|vrt|�d��||}|}|�Pt!j*|��r<|�:|t j+kr*d} |D]}!|!|vr|!dz|vs|!|krd} n�| rt j,}d|vr\|�dd��|�$��vr2|j-t j.kr||�dd��}"nd}"|dkrL|s||vrD|�|j-}|r| |||||||"��t_||d |�!��ta||||�"��|d#krH|rF|�|j-}|r| |||||||"��t_||d |�!��ta||||�"��t_||||||"�$��~tcj2��| sAt |��dkr.t �3d%|�d&|j4j5�d'|�d(��tm||��|r%to|||��tqj9|��tu||��dS))ax Loads a (potentially sharded) checkpoint inside a model, potentially sending weights to a given device as they are loaded. <Tip warning={true}> Once loaded across devices, you still need to call [`dispatch_model`] on your model to make it able to run. To group the checkpoint loading and dispatch in one single call, use [`load_checkpoint_and_dispatch`]. </Tip> Args: model (`torch.nn.Module`): The model in which we want to load a checkpoint. checkpoint (`str` or `os.PathLike`): The folder checkpoint to load. It can be: - a path to a file containing a whole model state dict - a path to a `.json` file containing the index to a sharded checkpoint - a path to a folder containing a unique `.index.json` file and the shards of a checkpoint. - a path to a folder containing a unique pytorch_model.bin or a model.safetensors file. device_map (`Dict[str, Union[int, str, torch.device]]`, *optional*): A map that specifies where each submodule should go. It doesn't need to be refined to each parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the same device. offload_folder (`str` or `os.PathLike`, *optional*): If the `device_map` contains any value `"disk"`, the folder where we will offload weights. dtype (`str` or `torch.dtype`, *optional*): If provided, the weights will be converted to that type when loaded. offload_state_dict (`bool`, *optional*, defaults to `False`): If `True`, will temporarily offload the CPU state dict on the hard drive to avoid getting out of CPU RAM if the weight of the CPU state dict + the biggest shard does not fit. offload_buffers (`bool`, *optional*, defaults to `False`): Whether or not to include the buffers in the weights offloaded to disk. keep_in_fp32_modules(`List[str]`, *optional*): A list of the modules that we keep in `torch.float32` dtype. offload_8bit_bnb (`bool`, *optional*): Whether or not to enable offload of 8-bit modules on cpu/disk. strict (`bool`, *optional*, defaults to `False`): Whether to strictly enforce that the keys in the checkpoint state_dict match the keys of the model's state_dict. r )�quantize_and_offload_8bitrr�NrPzeAt least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`.T)�exist_okrrz.jsonc�(�g|]}|tk� |��Sr�)r�r�r0s r+r�z,load_checkpoint_in_model.<locals>.<listcomp>Ms"��V�V�V�Q�A��DU�DU�q�DU�DU�DUr-c�(�g|]}|tk� |��Sr�)r rOs r+r�z,load_checkpoint_in_model.<locals>.<listcomp>Ns#��%b�%b�%b�A�1�Pa�Ka�Ka�a�Ka�Ka�Kar-c�<�g|]}|�d��|��S)z.index.json)rGrOs r+r�z,load_checkpoint_in_model.<locals>.<listcomp>Us)��^�^�^�Q�A�J�J�}�D]�D]�^�q�^�^�^r-z6 is not a folder containing a `.index.json` file or a z or a z filezI containing more than one `.index.json` file, delete the irrelevant ones.z�`checkpoint` should be the path to a file containing a whole state dict, or the index of a sharded checkpoint, or a folder containing a sharded checkpoint or the whole state dict, but got rQ� weight_mapc�P��g|]"}tj��|��#Sr�)rkrlr�)r�r0�checkpoint_folders �r+r�z,load_checkpoint_in_model.<locals>.<listcomp>ns*��Y�Y�Y�1�B�G�L�L�):�A�>�>�Y�Y�Yr-c��g|]\}}|��Sr�r�rs r+r�z,load_checkpoint_in_model.<locals>.<listcomp>ys��>�>�>�W�T�1�D�>�>�>r-)r�)rJr�r:z doesn't have any device set.Fr�r|rr0r/)rxrNryz(Some weights of the model checkpoint at z! were not used when initializing r�z�. This may or may not be an issue - make sure that the checkpoint does not have unnecessary parameters, or that the model definition correctly corresponds to the checkpoint.);�bnbrLrr�r�r�r�r�r�rIrk�makedirsr)rYrr�r3rl�isfilerG�isdir�listdirr�rr r��open�json�loads�readrrr��tempfile�mkdtempr r r�r5r,r r9�is_floating_pointrfrcrNrir�r�gc�collectrYr�r�rrq�shutil�rmtreer)$r*rFr�rmrNrGr�rHrIrJrLr��checkpoint_files�index_filename�potential_state_bin�potential_state_safetensor�potential_indexr0r1� offload_index�state_dict_folder�state_dict_index�unexpected_keys� model_keys�buffer_namesr/�loaded_checkpointrr�r:�param_device� new_dtype�proceedr$ryrTs$ @r+�load_checkpoint_in_modelru�s��j�3�2�2�2�2�2�2�&�u�-�-�K�&�u�-�-� �#�k�2B�2B�a�2G�2G�� A� � � ��,�[�*�E�E�E��*�"8�V�z�GX�GX�GZ�GZ�=Z�=Z��s� � � � � #� �(>�6�Z�M^�M^�M`�M`�C`�C`� ��N�T�2�2�2�2��%��&�� h��+�+��u�%�%��N� �w�~�~�j�!�!� ��z�?�?�#�#�G�,�,� ,�'�N�N� *�|�� z� "� "� �V�V�"�*�Z�*@�*@�V�V�V��%b�%b��J�1G�1G�%b�%b�%b�"��"�#�#�q�(�(� "��Z�9L�Q�9O� P� P�Q�� +� ,� ,�� 1� 1� "��Z�9S�TU�9V� W� W�X��_�^�"�*�Z�*@�*@�^�^�^�O��?�#�#�q�(�(� �!�F�F�Ye�F�F�m~�F�F�F��_�%�%��*�*�!#��j�/�!�:L�!M�!M�� !�l�l�l�� v�hr� v� v� v� � � � �!��G�M�M�.�9�9�!�<�� .� !� !� )�Q��J�q�v�v�x�x�(�(�E� )� )� )� )� )� )� )� )� )� )� )�� )� )� )� )��5� � ��,�'�E�!�$�s�5�<�<�>�>�':�':�";�";�<�<��Y�Y�Y�Y�HX�Y�Y�Y��M��$�,�.�.��e�e�O��U�%�%�'�'�,�,�.�.�/�/�J�>�>��(;�(;�(=�(=�>�>�>�L�+�K�K��+�O� �S�S�S��!�!�"3�F�!�C�C�C��"�"�3�'8�'=�'=�'?�'?�#@�#@�:�#M�N�N�N�N�%6�%<�%<�%>�%>�A �A �!� �E��J�&�&��Z�/�/�#�'�'� �3�3�3�!�!� �(��+�&�&��*�*�{�*�/L�/L�"%�(�(�;�+<�+<�S�+A�+A�#�2�#�+F�"G�"G�K��+�&�&��*�*�{�*�/L�/L��"�$�$��:�)=�)=�$� �%Q�%Q�%Q�R�R�R�)�+�6��!� ��$��)@��)G�)G�$�+�7�E�U�]�<R�<R�"'��#7�&�&�C�!$� �!2�!2��s��j�9P�9P�VY�]g�Vg�Vg�*.�� %��Wh�#�6�(-� �I��z�)�)�j�.@�.@��5�.Q�.Q�Uf�Uk�Uk�Um�Um�.m�.m��{�e�j�0�0�*;�J�<N�<N�x�Y^�<_�<_�*`��&*�O��6�)�)�&� _�*�L�*H�*H�$�,�(-��I�+�d�5�5� %�u�j�)�^�Ub�ds��%�7��z�6�Yb�c�c�c�c�&�u�j�.�P]�^�^�^�^��!�U�*�*�/A�*� �(�$)�K� �'�e�1�1�!�5�*�i�AR�Td�fu��4�E�:�v�U^�_�_�_�_�&�u�j�:K�Sc�d�d�d�d�d�/��"�$�#�'�(7� �� c�/�*�*�Q�.�.�� x�z� x� x�"�_�5� x� x�9H� x� x� x� � � � �}�n�5�5�5��)��u�&6�8I�J�J�J�� '�(�(�(��U�K�(�(�(�(�(s�'L;�;L?�L?� native_amp�autocast_kwargsc��t��}|�i}n|��}|�r|jtjkrtd��rdn|jj}|jdkrtj d |tjd�|��S|jdvr�|jtjtj tjtjtjtjtjtjtjtjtjfvrtj d |tjd�|��Stj d d|i|��St/j��S) aH Return a context manager for autocasting mixed precision Args: native_amp (`bool`, *optional*, defaults to False): Whether mixed precision is actually enabled. cache_enabled (`bool`, *optional*, defaults to True): Whether the weight cache inside autocast should be enabled. NT��check_is_gpur�fp16)�device_typerN)�bf16�fp8r|r�)r� to_kwargs�distributed_typer�XLArr4r2�mixed_precisionr3�autocastrf�NO� MULTI_CPU� MULTI_GPU� MULTI_MLU� MULTI_SDAA� MULTI_MUSA� MULTI_NPU� MULTI_XPU� MULTI_HPU�FSDPre� contextlib�nullcontext)rvrw�stater|s r+�#get_mixed_precision_context_managerr��sO�� E��)�3�3�5�5��(��&�/�*=�=�=�BX�fj�Bk�Bk�Bk�=� �F��"� � � �F�*�*��>�b�k��b�b�Ra�b�b�b� � "�o� 5� 5�%�:P��%��%��%��&��&��%��%��%�� U �; �; ��>�c�k��c�c�Sb�c�c�c��>�M�M�k�M�_�M�M�M��%�'�'�'r-r�c��|tjkrddlm}|di|��St d��rddlm}|jdi|��St��rtj jjdi|��St��rtjjjdi|��St��rtjjjdi|��St��rtjjjdi|��St#��rtjjdi|��St%��rtjjd i|��St'dd ��rtjjdi|��Stjjjdi|��S)ah A generic helper which will initialize the correct `GradScaler` implementation based on the environment and return it. Args: distributed_type (`DistributedType`, *optional*, defaults to None): The type of distributed environment. kwargs: Additional arguments for the utilized `GradScaler` constructor. r)�ShardedGradScalerTryNr�r�rPz2.3rr�)r�)r�)r)rr��*torch.distributed.fsdp.sharded_grad_scalerr�r� torch_xla.amp�amp� GradScalerrr3rGrrHrrIrrFrrr r)r�r�r��xamps r+�get_grad_scalerr�s��?�/�/�/�P�P�P�P�P�P� � �*�*�6�*�*�*��4�0�0�0�7�$�$�$�$�$�$��t��(�(��(�(�(� � � �7��y�}�'�1�1�&�1�1�1� � � �7��z�~�(�2�2�6�2�2�2� � � �7��z�~�(�2�2�6�2�2�2� � � � 7��y�}�'�1�1�&�1�1�1� � � �7��y�#�4�4�V�4�4�4� � � �7��y�#�4�4�V�4�4�4��D�%�(�(� 7��9�'�9�9�&�9�9�9��:�>�,�6�6�v�6�6�6r-c�p�ddlm}t|d��o t|j|��o|jjS)a\ Checks if a module has offloaded parameters by checking if the given module has a AlignDevicesHook attached with offloading enabled Args: module (`torch.nn.Module`): The module to check for an offload hook. Returns: bool: `True` if the module has an offload hook and offloading is enabled, `False` otherwise. r)�AlignDevicesHook�_hf_hook)�hooksr�r�r)r��offload)rvr�s r+rBrB(sD��)�(�(�(�(�(��6�:�&�&�t�:�f�o�GW�+X�+X�t�]c�]l�]t�tr-�execution_devicec#�K�t|��r�|�|jj}||j_ |j�|��dV�|j�|d��|�||j_dSdS#|j�|d��|�||j_wxYw|��d�|�d��D��} |D]}t |||��dV�|��D]\}}t |||��dS#|��D]\}}t |||��wxYwdV�dS)a~ Context manager that moves a module's parameters to the specified execution device. Args: module (`torch.nn.Module`): Module with parameters to align. execution_device (`torch.device`, *optional*): If provided, overrides the module's execution device within the context. Otherwise, use hook execution device or pass Nc�$�i|] \}}||j��Sr�r�rs r+rz'align_module_device.<locals>.<dictcomp>Rs ��`�`�`�+�$��4��`�`�`r-Fr�)rBr�r��pre_forward�post_forwardr�r�r )rvr��original_devicer�r�r4s r+r7r78s��F�#�#��'�$�o�>�O�/?�F�O�,� C��O�'�'��/�/�/��E�E�E��O�(�(��6�6�6��+�3B��0�0�0�,�+�� O�(�(��6�6�6��+�3B��0�B�B�B�B� � %�`�`��9P�9P�Y^�9P�9_�9_�`�`�`�� B�� L� L��+�F�D�:J�K�K�K�K��E�E�E� '� � �� B� B��f�+�F�D�&�A�A�A�A� B� B�� B� B��f�+�F�D�&�A�A�A�A� B�� s�A8�8+B#� D�-D=)NNNN)TFF)F)NNF)NNr�)r)NNNNF)NNNNFTFF)r/)NNNFFNFF)FN)sr�rbr�r�r\�loggingrkrWrdr_r��collectionsrr�typingrrr3�torch.nnr�r�r� constantsr r�dataclassesrr r�importsrrrrrrrrr�memoryrrr�rrrrr�versionsrr � torch_npu� torch_mlu� torch_sdaa� torch_musa�safetensorsr"�safetensors.torchr#r'�WEIGHTS_INDEX_NAME� getLoggerr�r�r,r7rErYrMrNr]r�r�r4rur=� HalfTensor�dictr�rRr�r�rr�r�r�r�rrrr0r3rCr^rgrqrxr�r�r�r�r�rrr5r>r^rE�PathLikerur�r�rB�contextmanagerr7r�r-r+�<module>r�s� �� 0�0�0�0�0�0�0�0�"�"�"�"�"�"�"�"��$�$�$�$�$�$�6�6�6�6�6�6�6�6�E�E�E�E�E�E�E�E�E�E� � � � � � � � � � � � � � � � � � � � � � �A�@�@�@�@�@�@�@�N�N�N�N�N�N�N�N�N�N�)�)�)�)�)�)�)�)�8�8�8�8�8�8�8�8��'�'�'��'�'�'��%�(�(�(��%�(�(�(��!�!�!�!�!�!�9�9�9�9�9�9�4�� 8� $� $��]�]�]�)�)�)�6)�5��c��?�)�)�)�)�X�5�;��8!4�e�l�!4�u�U�\�3��5K�/L�!4�!4�!4�!4�P%)�/3�26�MQ�N>�N>��I�N>��N>� �#�s�E�L�(�)�N>��E�L�!� N>� �E�#�u�{�*�+�,�N>��e�.�/� N>��d�3��U�\�5�<�-G�(H�#H�I�J�N>�N>�N>�N>�dkp�#�#��I�#�(,�#�>B�#�cg�#�#�#�#�<&�&�r�y�&�4�&�&�&�&�&.�.�.�.�.�t�.�.�.�"U�2�9�U�U�U�U�@;�;�;��.4x��4x�4x�4x�4x�n:�:�:�>�U�3��#4�5��%�+��15�DH�� '�'� �9�'��E�#�u�|�+�,�-�'��T�#�u�S�%�,�->�'?�"?�@�A�'�� '�'�'�'�X15�DH� #� #� �9� #��E�#�u�|�+�,�-� #��T�#�u�S�%�,�->�'?�"?�@�A� #� #� #� #�$!� �%��U�X�_�,�-� .�$!�>B�3��8�n�$!�gk�lo�gp�$!�$!�$!�$!�Nl�l�x��U�3��8�_�e�C��H�o�-M�(N�O�l�l�l�l�^��c�5��c�5�<�1G�+H�&H�!I��X[��,m�m�m�<��DH�37�/3�DH�� B�B� �9�B��e�C��H�o�u�S�#�X��>�?�@�B�&�d�3�i�0�B��E�#�u�{�*�+�,� B� �T�#�u�S�%�,�->�'?�"?�@�A�B�� B�B�B�B�J%�5�8�?�%�%�%�%�(DH�37�/3�DH�>�>� �9�>��e�C��H�o�u�S�#�X��>�?�@�>�&�d�3�i�0�>��E�#�u�{�*�+�,� >� �T�#�u�S�%�,�->�'?�"?�@�A�>��s�C�x��s�C�x��%��S��/� )�*��s�C�x��I��c��N��c��O��I��s�B�I�~� ��!� � >�>�>�>�B!B� �3��S� �4�� ?�*�+�!B�!B�!B�!B�P48�15�i!�i!� �%��R�Y��'� (�i!��s�C�x�.�i!��c�3�h��i!�&�d�3�i�0� i!� �d�4��9�o�.�i!��8�C�=�(�2�9�-�t�E�#�r�y�.�4I�/J�J�K� i!�i!�i!�i!�\DH�37�/3�CG��!� %�q�q� �9�q��e�C��H�o�u�S�#�X��>�?�@�q�&�d�3�i�0�q��E�#�u�{�*�+�,� q� �T�#�u�S�%�+�-=�'>�">�?�@�q�� q��q��q��q�q�q�q�h �B�I� �4��U�3��U�\�CY�=Z�8Z�3[� � � � �4\M�\M�\M�\M�~"�"�)�"�"�"�"�R<A� !�!��I�!��!��S�%��U�\� 1�2�2�3�!�!��c�5�<�!7�8� !�!�!�!�NFJ�8<�/3�$�!�&*�"��])�])� �9�])��c�2�;�&�'�])��c�5��c�5�<�)?�#@�@�A�B�])��U�3��#3�4�5� ])� �E�#�u�{�*�+�,�])�� ])��])��s�)�])��])� �])�])�])�])�@((�((�D�((�Sa�((�((�((�((�V#7�#7�o�#7�#7�#7�#7�L u�� u�T� u� u� u� u� ��#�#��#�8�E�L�CY�#�#�#��#�#�#r-