"""
The Trainer class, to easily train a 🤗 Transformers model from scratch or finetune it on a new task.
"""

import contextlib
import copy
import functools
import glob
import importlib.metadata
import inspect
import json
import math
import os
import random
import re
import shutil
import sys
import tempfile
import time
import warnings
from collections.abc import Mapping
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, Optional, Union

import huggingface_hub.utils as hf_hub_utils
import numpy as np
import torch
from huggingface_hub import ModelCard, create_repo, upload_folder
from packaging import version
from torch import nn
from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler

from . import __version__
from .configuration_utils import PretrainedConfig
from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from .debug_utils import DebugOption, DebugUnderflowOverflow
from .feature_extraction_sequence_utils import SequenceFeatureExtractor
from .feature_extraction_utils import FeatureExtractionMixin
from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend
from .image_processing_utils import BaseImageProcessor
from .integrations import get_reporting_integration_callbacks
from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available
from .integrations.tpu import tpu_spmd_dataloader
from .modelcard import TrainingSummary
from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model
from .models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, MODEL_MAPPING_NAMES
from .optimization import Adafactor, get_scheduler
from .processing_utils import ProcessorMixin
from .pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_2_3
from .tokenization_utils_base import PreTrainedTokenizerBase
from .trainer_callback import (
    CallbackHandler,
    DefaultFlowCallback,
    ExportableState,
    PrinterCallback,
    ProgressCallback,
    TrainerCallback,
    TrainerControl,
    TrainerState,
)
from .trainer_pt_utils import (
    DistributedTensorGatherer,
    EvalLoopContainer,
    IterableDatasetShard,
    LabelSmoother,
    LayerWiseDummyOptimizer,
    LengthGroupedSampler,
    SequentialDistributedSampler,
    distributed_broadcast_scalars,
    distributed_concat,
    find_batch_size,
    get_model_param_count,
    get_module_class_from_name,
    get_parameter_names,
    nested_concat,
    nested_detach,
    nested_numpify,
    nested_xla_mesh_reduce,
    reissue_pt_warnings,
    remove_dummy_checkpoint,
    set_rng_state_for_device,
)
from .trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    HPSearchBackend,
    HubStrategy,
    PredictionOutput,
    RemoveColumnsCollator,
    SaveStrategy,
    TrainerMemoryTracker,
    TrainOutput,
    check_target_module_exists,
    default_compute_objective,
    denumpify_detensorize,
    enable_full_determinism,
    find_executable_batch_size,
    get_last_checkpoint,
    has_length,
    neftune_post_forward_hook,
    number_of_arguments,
    seed_worker,
    set_seed,
    speed_metrics,
)
from .training_args import OptimizerNames, ParallelMode, TrainingArguments
from .utils import (
    ADAPTER_CONFIG_NAME,
    ADAPTER_SAFE_WEIGHTS_NAME,
    ADAPTER_WEIGHTS_NAME,
    CONFIG_NAME,
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    XLA_FSDPV2_MIN_VERSION,
    PushInProgress,
    PushToHubMixin,
    can_return_loss,
    find_labels,
    is_accelerate_available,
    is_apex_available,
    is_apollo_torch_available,
    is_bitsandbytes_available,
    is_datasets_available,
    is_galore_torch_available,
    is_grokadamw_available,
    is_in_notebook,
    is_ipex_available,
    is_liger_kernel_available,
    is_lomo_available,
    is_peft_available,
    is_safetensors_available,
    is_sagemaker_dp_enabled,
    is_sagemaker_mp_enabled,
    is_schedulefree_available,
    is_torch_compile_available,
    is_torch_hpu_available,
    is_torch_mlu_available,
    is_torch_mps_available,
    is_torch_musa_available,
    is_torch_neuroncore_available,
    is_torch_npu_available,
    is_torch_xla_available,
    is_torch_xpu_available,
    is_torchao_available,
    logging,
    strtobool,
)
from .utils.deprecation import deprecate_kwarg
from .utils.quantization_config import QuantizationMethod


DEFAULT_CALLBACKS = [DefaultFlowCallback]
DEFAULT_PROGRESS_CALLBACK = ProgressCallback

if is_in_notebook():
    from .utils.notebook import NotebookProgressCallback

    DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback

if is_apex_available():
    from apex import amp

if is_datasets_available():
    import datasets

if is_torch_xla_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met
    from torch_xla import __version__ as XLA_VERSION

    IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION)
    if IS_XLA_FSDPV2_POST_2_2:
        import torch_xla.distributed.spmd as xs
        import torch_xla.runtime as xr
else:
    IS_XLA_FSDPV2_POST_2_2 = False

if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp
    from smdistributed.modelparallel import __version__ as SMP_VERSION

    IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10")

    from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat
else:
    IS_SAGEMAKER_MP_POST_1_10 = False

if is_safetensors_available():
    import safetensors.torch

if is_peft_available():
    from peft import PeftModel

if is_accelerate_available():
    from accelerate import Accelerator, skip_first_batches
    from accelerate import __version__ as accelerate_version
    from accelerate.state import AcceleratorState
    from accelerate.utils import (
        AutocastKwargs,
        DistributedDataParallelKwargs,
        DistributedType,
        load_fsdp_model,
        load_fsdp_optimizer,
        save_fsdp_model,
        save_fsdp_optimizer,
    )

    DATA_SAMPLERS = [RandomSampler]
    if version.parse(accelerate_version) > version.parse("1.3.0"):
        from accelerate.utils import TorchTensorParallelPlugin
    if version.parse(accelerate_version) > version.parse("0.23.0"):
        from accelerate.data_loader import SeedableRandomSampler

        DATA_SAMPLERS += [SeedableRandomSampler]

    if is_deepspeed_available():
        from accelerate.utils import DeepSpeedSchedulerWrapper

if is_accelerate_available("0.28.0"):
    from accelerate.utils import DataLoaderConfiguration


def _is_peft_model(model):
    if is_peft_available():
        classes_to_check = (PeftModel,) if is_peft_available() else ()
        # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0
        if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"):
            from peft import PeftMixedModel

            classes_to_check = (*classes_to_check, PeftMixedModel)
        return isinstance(model, classes_to_check)
    return False


def _get_fsdp_ckpt_kwargs():
    if is_accelerate_available() and "adapter_only" in list(inspect.signature(save_fsdp_model).parameters):
        return {"adapter_only": True}
    else:
        return {}


def safe_globals():
    # Starting from PyTorch 2.6, `torch.load(weights_only=True)` is the default and
    # requires allowlisting of the objects being loaded.
    if version.parse(torch.__version__).release < version.parse("2.6").release:
        return contextlib.nullcontext()

    np_core = np._core if version.parse(np.__version__) >= version.parse("2.0.0") else np.core
    allowlist = [np_core.multiarray._reconstruct, np.ndarray, np.dtype]
    # numpy >1.25 defines numpy.dtypes.UInt32DType, but the line below works for all versions of numpy
    allowlist += [type(np.dtype(np.uint32))]

    return torch.serialization.safe_globals(allowlist)


if TYPE_CHECKING:
    import optuna

    if is_datasets_available():
        import datasets

logger = logging.get_logger(__name__)

# Name of the files used for checkpointing
TRAINING_ARGS_NAME = "training_args.bin"
TRAINER_STATE_NAME = "trainer_state.json"
OPTIMIZER_NAME = "optimizer.pt"
OPTIMIZER_NAME_BIN = "optimizer.bin"
SCHEDULER_NAME = "scheduler.pt"
SCALER_NAME = "scaler.pt"
FSDP_MODEL_NAME = "pytorch_model_fsdp"
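
# A usage sketch for `safe_globals()` (illustrative, not part of the original file):
# checkpoint-loading helpers wrap `torch.load(..., weights_only=True)` in it so that,
# on PyTorch >= 2.6 (where `weights_only=True` is the default), the numpy
# reconstruction objects found in optimizer/scheduler checkpoints are allowlisted.
#
#     with safe_globals():
#         state = torch.load("checkpoint/optimizer.pt", map_location="cpu", weights_only=True)
#
# On older PyTorch versions the context manager is a no-op (`contextlib.nullcontext()`).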


class Trainer:
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.

    Args:
        model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.

            <Tip>

            [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use
            your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers
            models.

            </Tip>

        args ([`TrainingArguments`], *optional*):
            The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the
            `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided.
        data_collator (`DataCollator`, *optional*):
            The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will
            default to [`default_data_collator`] if no `processing_class` is provided, and to an instance of
            [`DataCollatorWithPadding`] otherwise if the processing class is a feature extractor or tokenizer.
        train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*):
            The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed.

            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a
            distributed fashion, your iterable dataset should either use an internal attribute `generator` that is a
            `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will
            manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally
            sets the seed of the RNGs used.
        eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`], `datasets.Dataset`], *optional*):
            The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the
            `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each
            dataset, prepending the dictionary key to the metric name.
        processing_class ([`PreTrainedTokenizerBase`], [`BaseImageProcessor`], [`FeatureExtractionMixin`] or [`ProcessorMixin`], *optional*):
            Processing class used to process the data. If provided, will be used to automatically process the inputs
            for the model, and it will be saved along the model to make it easier to rerun an interrupted training or
            reuse the fine-tuned model. This supersedes the `tokenizer` argument, which is now deprecated.
        model_init (`Callable[[], PreTrainedModel]`, *optional*):
            A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start
            from a new instance of the model as given by this function.

            The function may take zero arguments, or a single one containing the optuna/Ray Tune/SigOpt trial object,
            to be able to choose different architectures according to hyperparameters (such as layer count, sizes of
            inner layers, dropout probabilities, etc.).
        compute_loss_func (`Callable`, *optional*):
            A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated
            batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss
            function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618)
            used by [`Trainer`].
        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
            The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return
            a dictionary mapping metric names to values. *Note* When passing TrainingArgs with `batch_eval_metrics`
            set to `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be
            triggered after the last eval batch to signal that the function needs to calculate and return the global
            summary statistics rather than accumulating the batch-level statistics.
        callbacks (List of [`TrainerCallback`], *optional*):
            A list of callbacks to customize the training loop. Will add those to the list of default callbacks
            detailed in [here](callback).

            If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
            A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on
            your model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
        optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*):
            A tuple containing the optimizer class and keyword arguments to use. Overrides `optim` and `optim_args`
            in `args`. Incompatible with the `optimizers` argument.

            Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices
            before initializing the Trainer.
        preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*):
            A function that preprocesses the logits right before caching them at each evaluation step. Must take two
            tensors, the logits and the labels, and return the logits once processed as desired. The modifications
            made by this function will be reflected in the predictions received by `compute_metrics`.

            Note that the labels (second parameter) will be `None` if the dataset does not have them.

    Important attributes:

        - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`]
          subclass.
        - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
          original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
          the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner
          model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
        - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
          data parallelism, this means some of the model layers are split on different GPUs).
        - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
          to `False` if model parallel or deepspeed is used, or if the default
          `TrainingArguments.place_model_on_device` is overridden to return `False`.
        - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while
          in `train`).
    """

    # Those are used as methods of the Trainer in examples.
    from .trainer_pt_utils import _get_learning_rate, log_metrics, metrics_format, save_metrics, save_state

    @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True)
    def __init__(
        self,
        model: Union[PreTrainedModel, nn.Module, None] = None,
        args: Optional[TrainingArguments] = None,
        data_collator: Optional[DataCollator] = None,
        train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None,
        eval_dataset: Optional[Union[Dataset, dict[str, Dataset], "datasets.Dataset"]] = None,
        processing_class: Optional[
            Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin]
        ] = None,
        model_init: Optional[Callable[[], PreTrainedModel]] = None,
        compute_loss_func: Optional[Callable] = None,
        compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None,
        callbacks: Optional[list[TrainerCallback]] = None,
        optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
        optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None,
        preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
    ):
        if args is None:
            output_dir = "tmp_trainer"
            logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.")
            args = TrainingArguments(output_dir=output_dir)
        self.args = args

        if args.batch_eval_metrics and compute_metrics is not None:
            if "compute_result" not in inspect.signature(compute_metrics).parameters.keys():
                raise ValueError(
                    "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`"
                    " boolean argument which will be triggered after the last batch of the eval set to signal that"
                    " the summary statistics should be returned by the function."
                )

        if args.eval_strategy is not None and args.eval_strategy != "no" and eval_dataset is None:
            raise ValueError(
                f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to"
                " `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`."
            )

        if (args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end) and args.metric_for_best_model is None:
            raise ValueError(
                "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if"
                " `args.load_best_model_at_end` is set to `True`."
            )

        # Seed must be set before instantiating the model when using model_init.
        enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)

        # The remainder of the original constructor -- memory tracking, model resolution
        # via `model_init`, device placement, liger-kernel patching, quantization/PEFT
        # validation, FSDP and DeepSpeed wiring, data collator selection, NEFTune setup,
        # callback registration, hub/push setup and accelerator creation -- could not be
        # recovered intact from this corrupted dump.
        ...
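
    # A minimal construction sketch (illustrative; the checkpoint name and the
    # tokenized `train_ds`/`eval_ds` datasets are assumptions, not part of this file):
    #
    #     from transformers import AutoModelForSequenceClassification, AutoTokenizer
    #
    #     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    #     model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    #     trainer = Trainer(
    #         model=model,
    #         args=TrainingArguments(output_dir="out", num_train_epochs=1),
    #         train_dataset=train_ds,
    #         eval_dataset=eval_ds,
    #         processing_class=tokenizer,
    #     )
    #     trainer.train()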

    @property
    def tokenizer(self) -> Optional[PreTrainedTokenizerBase]:
        logger.warning("Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.")
        return self.processing_class

    @tokenizer.setter
    def tokenizer(self, processing_class) -> None:
        logger.warning(
            "Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class`"
            " instead."
        )
        self.processing_class = processing_class

    def _activate_neftune(self, model):
        r"""
        Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper:
        https://arxiv.org/abs/2310.05914
        """
        unwrapped_model = self.accelerator.unwrap_model(model)

        if _is_peft_model(unwrapped_model):
            embeddings = unwrapped_model.base_model.model.get_input_embeddings()
        else:
            embeddings = unwrapped_model.get_input_embeddings()

        del unwrapped_model

        embeddings.neftune_noise_alpha = self.neftune_noise_alpha
        hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook)
        self.neftune_hook_handle = hook_handle
        return model
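
    # NEFTune is driven by `TrainingArguments.neftune_noise_alpha`; the Trainer calls
    # `_activate_neftune` at the start of training and removes the hook afterwards.
    # A sketch (the alpha value is an arbitrary illustration):
    #
    #     args = TrainingArguments(output_dir="out", neftune_noise_alpha=5.0)
    #     trainer = Trainer(model=model, args=args, train_dataset=train_ds)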

    def _deactivate_neftune(self, model):
        """
        Deactivates the neftune method. Make sure to call `_activate_neftune` first.
        """
        if not hasattr(self, "neftune_hook_handle"):
            raise ValueError("Neftune is not activated make sure to call `trainer._activate_neftune()` first")

        unwrapped_model = self.accelerator.unwrap_model(model)

        if _is_peft_model(unwrapped_model):
            embeddings = unwrapped_model.base_model.model.get_input_embeddings()
        else:
            embeddings = unwrapped_model.get_input_embeddings()

        self.neftune_hook_handle.remove()
        del embeddings.neftune_noise_alpha, unwrapped_model

    def add_callback(self, callback):
        """
        Add a callback to the current list of [`~transformers.TrainerCallback`].

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In
                the first case, will instantiate a member of that class.
        """
        self.callback_handler.add_callback(callback)
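
    # A callback sketch (the `LossPrinter` class is hypothetical, shown for
    # illustration; see `pop_callback`/`remove_callback` below for the removal
    # counterparts):
    #
    #     from transformers import TrainerCallback
    #
    #     class LossPrinter(TrainerCallback):
    #         def on_log(self, args, state, control, logs=None, **kwargs):
    #             if logs is not None and state.is_world_process_zero:
    #                 print(state.global_step, logs.get("loss"))
    #
    #     trainer.add_callback(LossPrinter)            # a class: Trainer instantiates it
    #     printer = trainer.pop_callback(LossPrinter)  # returns the removed instance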

    def pop_callback(self, callback):
        """
        Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it.

        If the callback is not found, returns `None` (and no error is raised).

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In
                the first case, will pop the first member of that class found in the list of callbacks.

        Returns:
            [`~transformers.TrainerCallback`]: The callback removed, if found.
        """
        return self.callback_handler.pop_callback(callback)

    def remove_callback(self, callback):
        """
        Remove a callback from the current list of [`~transformers.TrainerCallback`].

        Args:
            callback (`type` or [`~transformers.TrainerCallback`]):
                A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In
                the first case, will remove the first member of that class found in the list of callbacks.
        """
        self.callback_handler.remove_callback(callback)

    def _move_model_to_device(self, model, device):
        model = model.to(device)
        # Moving a model to an XLA device disconnects the tied weights, so we have to retie them.
        if self.args.parallel_mode == ParallelMode.TPU and hasattr(model, "tie_weights"):
            model.tie_weights()

    def _set_signature_columns_if_needed(self):
        if self._signature_columns is None:
            # Inspect model forward signature to keep only the arguments it accepts.
            model_to_inspect = self.model
            if _is_peft_model(self.model):
                if hasattr(self.model, "get_base_model"):
                    model_to_inspect = self.model.get_base_model()
                else:
                    model_to_inspect = self.model.base_model.model
            signature = inspect.signature(model_to_inspect.forward)
            self._signature_columns = list(signature.parameters.keys())
            # Labels may be named label or label_ids, the default data collator handles that.
            self._signature_columns += list(set(["label", "label_ids"] + self.label_names))

    def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None):
        if not self.args.remove_unused_columns:
            return dataset
        self._set_signature_columns_if_needed()
        signature_columns = self._signature_columns

        ignored_columns = list(set(dataset.column_names) - set(signature_columns))
        if len(ignored_columns) > 0:
            dset_description = "" if description is None else f"in the {description} set"
            logger.info(
                f"The following columns {dset_description} don't have a corresponding argument in "
                f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}."
                f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, "
                " you can safely ignore this message."
            )

        columns = [k for k in signature_columns if k in dataset.column_names]
        if len(columns) == 0:
            raise ValueError(
                "No columns in the dataset match the model's forward method signature. "
                f"The following columns have been ignored: [{', '.join(ignored_columns)}]. "
                "Please check the dataset and model. You may need to set `remove_unused_columns=False` in"
                " `TrainingArguments`."
            )

        if version.parse(datasets.__version__) < version.parse("1.4.0"):
            dataset.set_format(
                type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"]
            )
            return dataset
        else:
            return dataset.remove_columns(ignored_columns)

    def _get_collator_with_removed_columns(
        self, data_collator: Callable, description: Optional[str] = None
    ) -> Callable:
        """Wrap the data collator in a callable removing unused columns."""
        if not self.args.remove_unused_columns:
            return data_collator
        self._set_signature_columns_if_needed()
        signature_columns = self._signature_columns

        remove_columns_collator = RemoveColumnsCollator(
            data_collator=data_collator,
            signature_columns=signature_columns,
            logger=logger,
            description=description,
            model_name=self.model.__class__.__name__,
        )
        return remove_columns_collator

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        # Build the sampler.
        if self.args.group_by_length:
            if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset):
                lengths = (
                    self.train_dataset[self.args.length_column_name]
                    if self.args.length_column_name in self.train_dataset.column_names
                    else None
                )
            else:
                lengths = None
            model_input_name = (
                self.processing_class.model_input_names[0] if self.processing_class is not None else None
            )
            return LengthGroupedSampler(
                self.args.train_batch_size * self.args.gradient_accumulation_steps,
                dataset=self.train_dataset,
                lengths=lengths,
                model_input_name=model_input_name,
            )

        return RandomSampler(self.train_dataset)
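
    # `group_by_length` switches the sampler above to `LengthGroupedSampler`, which
    # batches samples of similar length to reduce padding. A sketch (the column name
    # is an assumption about the dataset, not a default of this file):
    #
    #     args = TrainingArguments(output_dir="out", group_by_length=True, length_column_name="length")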

    def get_train_dataloader(self) -> DataLoader:
        """
        Returns the training [`~torch.utils.data.DataLoader`].

        Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
        training if necessary) otherwise.

        Subclass and override this method if you want to inject some custom behavior.
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator
        if is_datasets_available() and isinstance(train_dataset, datasets.Dataset):
            train_dataset = self._remove_unused_columns(train_dataset, description="training")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="training")

        dataloader_params = {
            "batch_size": self._train_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))

    def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]:
        if eval_dataset is None or not has_length(eval_dataset):
            return None

        # Deprecated code path
        if self.args.use_legacy_prediction_loop:
            if is_torch_xla_available():
                return SequentialDistributedSampler(
                    eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()
                )
            elif is_sagemaker_mp_enabled():
                return SequentialDistributedSampler(
                    eval_dataset,
                    num_replicas=smp.dp_size(),
                    rank=smp.dp_rank(),
                    batch_size=self.args.per_device_eval_batch_size,
                )
            else:
                return SequentialSampler(eval_dataset)

        if self.args.group_by_length:
            if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
                lengths = (
                    eval_dataset[self.args.length_column_name]
                    if self.args.length_column_name in eval_dataset.column_names
                    else None
                )
            else:
                lengths = None
            model_input_name = (
                self.processing_class.model_input_names[0] if self.processing_class is not None else None
            )
            return LengthGroupedSampler(
                self.args.eval_batch_size,
                dataset=eval_dataset,
                lengths=lengths,
                model_input_name=model_input_name,
            )

        if self.args.world_size <= 1:
            return SequentialSampler(eval_dataset)
        return None

    def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader:
        """
        Returns the evaluation [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*):
                If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will
                override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns
                not accepted by the `model.forward()` method are automatically removed.
        """
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        # If we have persistent workers, don't fork new ones: eval datasets don't change during training.
        dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval"
        if (
            hasattr(self, "_eval_dataloaders")
            and dataloader_key in self._eval_dataloaders
            and self.args.dataloader_persistent_workers
        ):
            return self.accelerator.prepare(self._eval_dataloaders[dataloader_key])

        eval_dataset = (
            self.eval_dataset[eval_dataset]
            if isinstance(eval_dataset, str)
            else eval_dataset
            if eval_dataset is not None
            else self.eval_dataset
        )
        data_collator = self.data_collator

        if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset):
            eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation")

        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        eval_dataloader = DataLoader(eval_dataset, **dataloader_params)
        if self.args.dataloader_persistent_workers:
            if hasattr(self, "_eval_dataloaders"):
                self._eval_dataloaders[dataloader_key] = eval_dataloader
            else:
                self._eval_dataloaders = {dataloader_key: eval_dataloader}

        return self.accelerator.prepare(eval_dataloader)
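
    # Overriding the dataloader hooks is the supported way to inject custom batching.
    # A sketch using a weighted sampler (`sample_weights` is assumed to exist and is
    # not part of this file):
    #
    #     from torch.utils.data import WeightedRandomSampler
    #
    #     class WeightedTrainer(Trainer):
    #         def get_train_dataloader(self):
    #             sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights))
    #             loader = DataLoader(
    #                 self.train_dataset,
    #                 batch_size=self._train_batch_size,
    #                 sampler=sampler,
    #                 collate_fn=self.data_collator,
    #             )
    #             return self.accelerator.prepare(loader)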

    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
        """
        Returns the test [`~torch.utils.data.DataLoader`].

        Subclass and override this method if you want to inject some custom behavior.

        Args:
            test_dataset (`torch.utils.data.Dataset`, *optional*):
                The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the
                `model.forward()` method are automatically removed. It must implement `__len__`.
        """
        data_collator = self.data_collator

        if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
            test_dataset = self._remove_unused_columns(test_dataset, description="test")
        else:
            data_collator = self._get_collator_with_removed_columns(data_collator, description="test")

        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        if not isinstance(test_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_eval_sampler(test_dataset)
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        # We use the same batch_size as for eval.
        return self.accelerator.prepare(DataLoader(test_dataset, **dataloader_params))

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """
        Setup the optimizer and the learning rate scheduler.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in
        the Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or
        `create_scheduler`) in a subclass.
        """
        self.create_optimizer()
        if IS_SAGEMAKER_MP_POST_1_10 and smp.state.cfg.fp16:
            # If smp >= 1.10 and fp16 is enabled, we unwrap the optimizer
            optimizer = self.optimizer.optimizer
        else:
            optimizer = self.optimizer
        self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)

    def get_decay_parameter_names(self, model) -> list[str]:
        """
        Get all parameter names that weight decay will be applied to.

        This function filters out parameters in two ways:
        1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
        2. By parameter name patterns (containing 'bias', 'layernorm', or 'rmsnorm')
        """
        decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"])
        return decay_parameters

    def create_optimizer(self):
        """
        Setup the optimizer.

        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in
        the Trainer's init through `optimizers`, or subclass and override this method in a subclass.
        """
        opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model

        if self.optimizer is None:
            decay_parameters = self.get_decay_parameter_names(opt_model)
            optimizer_grouped_parameters = [
                {
                    "params": [
                        p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)
                    ],
                    "weight_decay": self.args.weight_decay,
                },
                {
                    "params": [
                        p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)
                    ],
                    "weight_decay": 0.0,
                },
            ]

            if self.optimizer_cls_and_kwargs is not None:
                optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
            else:
                optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model)

            # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`, e.g. for GaLore.
            if "params" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("params")

            # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`, e.g. for LOMO.
            if "model" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("model")

            # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
            # to avoid argument conflicts.
            if "optimizer_dict" in optimizer_kwargs:
                optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict")

            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)

            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
                        logger.info(f"skipped {module}: {skipped / 2**20}M params")
                        manager.register_module_override(module, "weight", {"optim_bits": 32})
                        logger.debug(f"bitsandbytes: will optimize {module} in fp32")
                logger.info(f"skipped: {skipped / 2**20}M params")

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(self.optimizer)

        return self.optimizer
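
    # Two supported ways to bypass the default optimizer (sketch; `my_optimizer` and
    # `my_scheduler` are assumed to exist): either pass an instantiated pair, or pass
    # the class plus kwargs and let the Trainer build it after the model has been
    # placed on its device.
    #
    #     trainer = Trainer(model=model, args=args, train_dataset=train_ds,
    #                       optimizers=(my_optimizer, my_scheduler))
    #
    #     trainer = Trainer(model=model, args=args, train_dataset=train_ds,
    #                       optimizer_cls_and_kwargs=(torch.optim.AdamW, {"lr": 2e-5}))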

    def get_num_trainable_parameters(self):
        """
        Get the number of trainable parameters.
        """
        return sum(p.numel() for p in self.model.parameters() if p.requires_grad)

    def get_learning_rates(self):
        """
        Returns the learning rate of each parameter group from self.optimizer.
        """
        if self.optimizer is None:
            raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.")
        return [group["lr"] for group in self.optimizer.param_groups]

    def get_optimizer_group(self, param: Optional[Union[str, torch.nn.parameter.Parameter]] = None):
        """
        Returns optimizer group for a parameter if given, else returns all optimizer groups for params.

        Args:
            param (`str` or `torch.nn.parameter.Parameter`, *optional*):
                The parameter for which optimizer group needs to be returned.
        """
        if self.optimizer is None:
            raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.")
        if param is not None:
            for group in self.optimizer.param_groups:
                if param in group["params"]:
                    return group
        return [group["params"] for group in self.optimizer.param_groups]
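
    # Inspection sketch: these helpers are handy inside callbacks or after
    # `create_optimizer()` has run.
    #
    #     for i, lr in enumerate(trainer.get_learning_rates()):
    #         print(f"param group {i}: lr={lr}")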

    @staticmethod
    def get_optimizer_cls_and_kwargs(
        args: TrainingArguments, model: Optional[PreTrainedModel] = None
    ) -> tuple[Any, Any]:
        """
        Returns the optimizer class and optimizer parameters based on the training arguments.

        Args:
            args (`transformers.training_args.TrainingArguments`):
                The training arguments for the training session.
        """
        # Parse `args.optim_args` ("key1=value1,key2=value2") into a dict merged into
        # the optimizer keyword arguments below.
        optim_args = {}
        if args.optim_args:
            for mapping in args.optim_args.replace(" ", "").split(","):
                key, value = mapping.split("=")
                optim_args[key] = value

        optimizer_kwargs = {"lr": args.learning_rate}
        adam_kwargs = {
            "betas": (args.adam_beta1, args.adam_beta2),
            "eps": args.adam_epsilon,
        }

        def setup_low_rank_optimizer(optimizer_name, optimizer_mapping, optim_kwargs, is_layerwise_supported=True):
            """
            Helper function to set up low-rank optimizers like GaLore and Apollo: it resolves the target modules
            from `args.optim_target_modules` (a list of strings, a regex string, or "all-linear"), keeps only
            `nn.Linear` parameters, and, for the layer-wise variants, registers per-parameter
            post-accumulation-grad hooks wrapped in a `LayerWiseDummyOptimizer` (layer-wise optimization does not
            support DDP or gradient accumulation).

            Returns:
                Tuple[Any, Any]: Optimizer class and updated optimizer kwargs.
            """
            ...

        # The remainder of the original method is a large dispatch on `args.optim`
        # (OptimizerNames) covering, among others: Adafactor; torch AdamW (incl. fused,
        # XLA syncfree and NPU fused variants); apex FusedAdam; the bitsandbytes family
        # (8-bit/paged Adam, Lion, RMSprop, AdEMAMix -- requiring bitsandbytes >= 0.44.0
        # for AdEMAMix and warning below 0.41.1 where 8-bit optimizers had a major bug);
        # AnyPrecisionAdamW (torchdistx); SGD/Adagrad/RMSprop; GaLore and APOLLO (incl.
        # layer-wise variants); LOMO/AdaLomo (requires accelerate >= 0.30.0); GrokAdamW;
        # torchao 4/8-bit AdamW (requires torchao >= 0.4.0 and torch > 2.4); and the
        # schedulefree optimizers (RAdamScheduleFree needs schedulefree >= 1.4.0). Each
        # branch selects `optimizer_cls` and updates `optimizer_kwargs`, raising
        # ImportError with installation instructions when the backing package is
        # missing, and `ValueError` for unsupported optimizer names. That dispatch body
        # could not be recovered intact from this corrupted dump.
        ...  # resolves to `return optimizer_cls, optimizer_kwargs`
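
    # Selection sketch: `args.optim` names the optimizer and `args.optim_args` carries
    # the "key=value,key=value" string parsed above. The GaLore values shown are
    # illustrative, not defaults:
    #
    #     args = TrainingArguments(
    #         output_dir="out",
    #         optim="galore_adamw",
    #         optim_target_modules=["attn", "mlp"],
    #         optim_args="rank=64,update_proj_gap=100,scale=0.10",
    #     )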

    def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None):
        """
        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
        passed as an argument.

        Args:
            num_training_steps (int): The number of training steps to do.
        """
        if self.lr_scheduler is None:
            self.lr_scheduler = get_scheduler(
                self.args.lr_scheduler_type,
                optimizer=self.optimizer if optimizer is None else optimizer,
                num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                num_training_steps=num_training_steps,
                scheduler_specific_kwargs=self.args.lr_scheduler_kwargs,
            )
            self._created_lr_scheduler = True
        return self.lr_scheduler

    def num_examples(self, dataloader: DataLoader) -> int:
        """
        Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When
        dataloader.dataset does not exist or has no length, estimates as best it can.
        """
        try:
            dataset = dataloader.dataset
            # Special case for IterableDatasetShard, we need to dig deeper
            if isinstance(dataset, IterableDatasetShard):
                return len(dataloader.dataset.dataset)
            return len(dataloader.dataset)
        except (NameError, AttributeError, TypeError):
            # no dataset or length, estimate by length of dataloader
            return len(dataloader) * self.args.per_device_train_batch_size

    def num_tokens(self, train_dl: DataLoader, max_steps: Optional[int] = None) -> int:
        """
        Helper to get number of tokens in a [`~torch.utils.data.DataLoader`] by enumerating dataloader.
        """
        train_tokens = 0
        try:
            for batch in train_dl:
                tokens = batch["input_ids"].numel()
                if max_steps is not None:
                    return tokens * max_steps
                train_tokens += tokens
        except KeyError:
            logger.warning("Cannot get num_tokens from dataloader")
        return train_tokens

    def _hp_search_setup(self, trial: Union["optuna.Trial", dict[str, Any]]):
        """HP search setup code"""
        self._trial = trial

        if self.hp_search_backend is None or trial is None:
            return
        if self.hp_search_backend == HPSearchBackend.OPTUNA:
            params = self.hp_space(trial)
        elif self.hp_search_backend == HPSearchBackend.RAY:
            params = trial
            params.pop("wandb", None)
        elif self.hp_search_backend == HPSearchBackend.SIGOPT:
            params = {k: int(v) if isinstance(v, str) else v for k, v in trial.assignments.items()}
        elif self.hp_search_backend == HPSearchBackend.WANDB:
            params = trial

        for key, value in params.items():
            if not hasattr(self.args, key):
                logger.warning(
                    f"Trying to set {key} in the hyperparameter search but there is no corresponding field in"
                    " `TrainingArguments`."
                )
                continue
            old_attr = getattr(self.args, key, None)
            # Casting value to the proper type
            if old_attr is not None:
                value = type(old_attr)(value)

            setattr(self.args, key, value)
        if self.hp_search_backend == HPSearchBackend.OPTUNA:
            logger.info(f"Trial: {trial.params}")
        if self.hp_search_backend == HPSearchBackend.SIGOPT:
            logger.info(f"SigOpt Assignments: {trial.assignments}")
        if self.hp_search_backend == HPSearchBackend.WANDB:
            logger.info(f"W&B Sweep parameters: {trial}")
        if self.is_deepspeed_enabled:
            if self.args.deepspeed is None:
                raise ValueError("For sweeps with deepspeed, `args.deepspeed` must be set")

            self.accelerator.free_memory()
            # Rebuild the deepspeed config to reflect the updated training parameters
            from accelerate.utils import DeepSpeedPlugin
            from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

            self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed)
            self.args.hf_deepspeed_config.trainer_config_process(self.args)
            self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config)
            AcceleratorState()._reset_state()
        self.create_accelerator_and_postprocess()

    def _report_to_hp_search(self, trial: Union["optuna.Trial", dict[str, Any]], step: int, metrics: dict[str, float]):
        if self.hp_search_backend is None or trial is None:
            return
        metrics = metrics.copy()
        self.objective = self.compute_objective(metrics)
        if self.hp_search_backend == HPSearchBackend.OPTUNA:
            import optuna

            if hasattr(trial, "study") and not trial.study._is_multi_objective():
                trial.report(self.objective, step)
                if trial.should_prune():
                    self.callback_handler.on_train_end(self.args, self.state, self.control)
                    raise optuna.TrialPruned()
        elif self.hp_search_backend == HPSearchBackend.RAY:
            import ray.train

            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
                checkpoint = None
                if self.control.should_save:
                    self._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir)
                    checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
                metrics["objective"] = self.objective
                ray.train.report(metrics, checkpoint=checkpoint)

    def _tune_save_checkpoint(self, checkpoint_dir: str):
        output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
        self.save_model(output_dir, _internal_call=True)
        if self.args.should_save:
            # Update the `TrainerControl` state to where we are currently
            self.state.stateful_callbacks["TrainerControl"] = self.control.state()
            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
            torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))

    def call_model_init(self, trial=None):
        model_init_argcount = number_of_arguments(self.model_init)
        if model_init_argcount == 0:
            model = self.model_init()
        elif model_init_argcount == 1:
            model = self.model_init(trial)
        else:
            raise RuntimeError("model_init should have 0 or 1 argument.")

        if model is None:
            raise RuntimeError("model_init should not return None.")

        return model
    def torch_jit_model_eval(self, model, dataloader, training=False):
        if not training:
            if dataloader is None:
                logger.warning("failed to use PyTorch jit mode due to current dataloader is none.")
                return model
            example_batch = next(iter(dataloader))
            example_batch = self._prepare_inputs(example_batch)
            try:
                jit_model = copy.copy(model)
                jit_model.eval()
                original_forward = jit_model.__dict__.pop("_original_forward", None)
                # remove mixed precision hooks from the model
                if original_forward:
                    jit_model.forward = original_forward
                autocast_handler = AutocastKwargs(cache_enabled=False)
                with self.accelerator.autocast(autocast_handler=autocast_handler), torch.no_grad():
                    if version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.0.0"):
                        jit_model = torch.jit.trace(
                            jit_model,
                            example_kwarg_inputs={key: example_batch[key] for key in example_batch},
                            strict=False,
                        )
                    else:
                        jit_inputs = []
                        for key in example_batch:
                            example_tensor = torch.ones_like(example_batch[key])
                            jit_inputs.append(example_tensor)
                        jit_inputs = tuple(jit_inputs)
                        jit_model = torch.jit.trace(jit_model, jit_inputs, strict=False)
                jit_model = torch.jit.freeze(jit_model)
                with torch.no_grad():
                    jit_model(**example_batch)
                    jit_model(**example_batch)
                model = jit_model
                self.use_cpu_amp = False
            except (RuntimeError, TypeError, ValueError, NameError, IndexError) as e:
                logger.warning(f"failed to use PyTorch jit mode due to: {e}.")

        return model

    def ipex_optimize_model(self, model, training=False, dtype=torch.float32):
        if not is_ipex_available():
            raise ImportError(
                "Using IPEX but IPEX is not installed or IPEX's version does not match current PyTorch, please refer"
                " to https://github.com/intel/intel-extension-for-pytorch."
            )

        import intel_extension_for_pytorch as ipex

        if not training:
            model.eval()
            dtype = torch.bfloat16 if not self.is_in_train and self.args.bf16_full_eval else dtype
            # conv_bn_folding is disabled as it fails in symbolic tracing, resulting in ipex warnings
            model = ipex.optimize(model, dtype=dtype, level="O1", conv_bn_folding=False, inplace=not self.is_in_train)
        else:
            if not model.training:
                model.train()
            model, self.optimizer = ipex.optimize(
                model, dtype=dtype, optimizer=self.optimizer, inplace=True, level="O1"
            )

        return model

    def compare_trainer_and_checkpoint_args(self, training_args, trainer_state):
        attributes_map = {
            "logging_steps": "logging_steps",
            "eval_steps": "eval_steps",
            "save_steps": "save_steps",
        }

        has_warning = False
        warning_str = "Warning: The following arguments do not match the ones in the `trainer_state.json` within the checkpoint directory: "
        for arg_attr, state_attr in attributes_map.items():
            arg_value = getattr(training_args, arg_attr, None)
            state_value = getattr(trainer_state, state_attr, None)
            if arg_value is not None and state_value is not None and arg_value != state_value:
                warning_str += f"\n\t{arg_attr}: {arg_value} (from args) != {state_value} (from trainer_state.json)"
                has_warning = True

        # train batch size is special as we need to account for multi-GPU
        train_bs_args = training_args.per_device_train_batch_size
        train_bs_state = trainer_state.train_batch_size // max(1, training_args.n_gpu)

        if train_bs_args != train_bs_state:
            warning_str += f"\n\tper_device_train_batch_size: {train_bs_args} (from args) != {train_bs_state} (from trainer_state.json)"
            has_warning = True

        if has_warning:
            logger.warning_once(warning_str)

    def _wrap_model(self, model, training=True, dataloader=None):
        if self.args.use_ipex:
            dtype = torch.bfloat16 if self.use_cpu_amp else torch.float32
            model = self.ipex_optimize_model(model, training, dtype=dtype)

        if is_sagemaker_mp_enabled():
            # Wrapping the base model twice in a DistributedModel will raise an error.
            if isinstance(self.model_wrapped, smp.model.DistributedModel):
                return self.model_wrapped
            return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps)

        # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again
        if self.accelerator.unwrap_model(model) is not model:
            return model

        # Mixed precision training with apex
        if self.use_apex and training:
            model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level)

        # Multi-gpu training (should be after apex fp16 initialization)
        if self.args.n_gpu > 1 and not getattr(model, "is_loaded_in_8bit", False):
            model = nn.DataParallel(model)

        if self.args.jit_mode_eval:
            start_time = time.time()
            model = self.torch_jit_model_eval(model, dataloader, training)
            self.jit_compilation_time = round(time.time() - start_time, 4)

        # Note: in torch.distributed mode, there's no point in wrapping the model
        # inside a DistributedDataParallel as we'll be under `no_grad` anyways.
        if not training:
            return model

        # Distributed training using PyTorch FSDP on XLA
        if self.is_fsdp_xla_enabled:
            try:
                from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
                from torch_xla.distributed.fsdp import checkpoint_module
                from torch_xla.distributed.fsdp.wrap import (
                    size_based_auto_wrap_policy,
                    transformer_auto_wrap_policy,
                )

                if self.is_fsdp_xla_v2_enabled:
                    from torch_xla.experimental.spmd_fully_sharded_data_parallel import (
                        SpmdFullyShardedDataParallel as FSDPv2,
                    )
            except ImportError:
                raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.")

            auto_wrap_policy = None
            auto_wrapper_callable = None
            default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None)
            fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get(
                "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap
            )

            if self.args.fsdp_config["min_num_params"] > 0:
                auto_wrap_policy = functools.partial(
                    size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["min_num_params"]
                )
            elif fsdp_transformer_layer_cls_to_wrap is not None:
                transformer_cls_to_wrap = set()
                for layer_class in fsdp_transformer_layer_cls_to_wrap:
                    transformer_cls = get_module_class_from_name(model, layer_class)
                    if transformer_cls is None:
                        raise Exception("Could not find the transformer layer class to wrap in the model.")
                    transformer_cls_to_wrap.add(transformer_cls)

                auto_wrap_policy = functools.partial(
                    transformer_auto_wrap_policy,
                    transformer_layer_cls=transformer_cls_to_wrap,
                )

            fsdp_kwargs = self.args.xla_fsdp_config
            if self.args.fsdp_config["xla_fsdp_grad_ckpt"]:
                if model.config.use_cache:
                    logger.warning_once(
                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
                    )
                    model.config.use_cache = False

                # Apply gradient checkpointing to auto-wrapped sub-modules if specified
                def auto_wrapper_callable(m, *args, **kwargs):
                    target_cls = FSDP if not self.is_fsdp_xla_v2_enabled else FSDPv2
                    return target_cls(checkpoint_module(m), *args, **kwargs)

            if self.is_fsdp_xla_v2_enabled:

                def shard_output(output, mesh):
                    from .modeling_outputs import CausalLMOutputWithPast

                    real_output = None
                    if isinstance(output, torch.Tensor):
                        real_output = output
                    elif isinstance(output, tuple):
                        real_output = output[0]
                    elif isinstance(output, CausalLMOutputWithPast):
                        real_output = output.logits

                    if real_output is None:
                        raise ValueError("Something went wrong, the output of the model shouldn't be `None`")
                    xs.mark_sharding(real_output, mesh, ("fsdp", None, None))

                self.model = model = FSDPv2(
                    model,
                    shard_output=shard_output,
                    auto_wrap_policy=auto_wrap_policy,
                    auto_wrapper_callable=auto_wrapper_callable,
                )
            else:
                self.model = model = FSDP(
                    model,
                    auto_wrap_policy=auto_wrap_policy,
                    auto_wrapper_callable=auto_wrapper_callable,
                    **fsdp_kwargs,
                )

            # Patch `xm.optimizer_step` not to reduce gradients in this case,
            # as FSDP does not need gradient reduction over shards.
            def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}):
                loss = optimizer.step(**optimizer_args)
                if barrier:
                    xm.mark_step()
                return loss

            xm.optimizer_step = patched_optimizer_step
        elif is_sagemaker_dp_enabled():
            model = nn.parallel.DistributedDataParallel(
                model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))]
            )
        elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
            if is_torch_neuroncore_available():
                return model
            kwargs = {}
            if self.args.ddp_find_unused_parameters is not None:
                kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters
            elif isinstance(model, PreTrainedModel):
                # find_unused_parameters breaks checkpointing as per
                # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021
                kwargs["find_unused_parameters"] = not model.is_gradient_checkpointing
            else:
                kwargs["find_unused_parameters"] = True

            if self.args.ddp_bucket_cap_mb is not None:
                kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb

            if self.args.ddp_broadcast_buffers is not None:
                kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers

            self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs)

        return model
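    # For orientation, the `ParallelMode.DISTRIBUTED` branch of `_wrap_model`
    # above amounts to the plain PyTorch wrapping below (a sketch assuming a
    # `torch.distributed` process group is already initialized; the real code
    # defers the wrapping to `accelerate` via `DistributedDataParallelKwargs`):
    #
    #     ddp_model = nn.parallel.DistributedDataParallel(
    #         model,
    #         find_unused_parameters=False,  # the safer default when gradient checkpointing is on
    #         broadcast_buffers=True,
    #     )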
    def train(
        self,
        resume_from_checkpoint: Optional[Union[str, bool]] = None,
        trial: Union["optuna.Trial", Dict[str, Any]] = None,
        ignore_keys_for_eval: Optional[List[str]] = None,
        **kwargs,
    ):
        """
        Main training entry point.

        Args:
            resume_from_checkpoint (`str` or `bool`, *optional*):
                If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
                `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous
                instance of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states
                loaded here.
            trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
                The trial run or the hyperparameter dictionary for hyperparameter search.
            ignore_keys_for_eval (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions for evaluation during the training.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments used to hide deprecated arguments
        """
        if resume_from_checkpoint is False:
            resume_from_checkpoint = None

        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        args = self.args

        self.is_in_train = True

        if "model_path" in kwargs:
            resume_from_checkpoint = kwargs.pop("model_path")
            warnings.warn(
                "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` "
                "instead.",
                FutureWarning,
            )
        if len(kwargs) > 0:
            raise TypeError(f"train() got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.")
        # This might change the seed so needs to run first.
        self._hp_search_setup(trial)
        self._train_batch_size = self.args.train_batch_size

        # Model re-init
        model_reloaded = False
        if self.model_init is not None:
            # Seed must be set before instantiating the model when using model_init.
            enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed)
            self.model = self.call_model_init(trial)
            model_reloaded = True
            # Reinitializes optimizer and scheduler
            self.optimizer, self.lr_scheduler = None, None

        # Load potential model checkpoint
        if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint:
            resume_from_checkpoint = get_last_checkpoint(args.output_dir)
            if resume_from_checkpoint is None:
                raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})")

        if resume_from_checkpoint is not None:
            if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled:
                self._load_from_checkpoint(resume_from_checkpoint)
            # In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly
            state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
            if state.train_batch_size is not None:
                self._train_batch_size = state.train_batch_size

        # If model was re-initialized, put it on the right device and update self.model_wrapped
        if model_reloaded:
            if self.place_model_on_device:
                self._move_model_to_device(self.model, args.device)
            self.model_wrapped = self.model

        inner_training_loop = find_executable_batch_size(
            self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
        )
        if args.auto_find_batch_size:
            try:
                # Disable progress bars when uploading models during checkpoints to avoid polluting stdout
                hf_hub_utils.disable_progress_bars()
                return inner_training_loop(
                    args=args,
                    resume_from_checkpoint=resume_from_checkpoint,
                    trial=trial,
                    ignore_keys_for_eval=ignore_keys_for_eval,
                )
            finally:
                hf_hub_utils.enable_progress_bars()
        else:
            return inner_training_loop(
                args=args,
                resume_from_checkpoint=resume_from_checkpoint,
                trial=trial,
                ignore_keys_for_eval=ignore_keys_for_eval,
            )
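    # A minimal usage sketch for `train` (illustrative, assuming `model`,
    # `training_args` and `train_dataset` are already built):
    #
    #     trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
    #     trainer.train(resume_from_checkpoint=True)  # resume from the latest checkpoint in output_dir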
}"|dkr9|dz}|�|�ud��|dkr|�f|����s|�|�v��d}|$|jzdkr+|j*�w||j|j,��|_,|-t�|+��dz kr:|jjxt�jzkr t�j||jj}|�-��n t�j}2|2��5|��||.|,��}3ddd��n #1swxYwY|j�rU�t��sFt�j�|3��st�j�|3��r||d|jjQz|jZz z z}n4|jX|3jXkrt;d.|jX�d/|3jX�����||3z}|xj��t |��|.����z c_�|/�r�|jjk�ld��|j���|j�dk�rt?��r'|j�r |j'��|j���}4nx|j9r?�tj����tj�|j'��|j���}4n2|j��|����|j���}4�t#��rS|jjxt�jzkr9|����}tq|d0��r|�t��}n|4}|j*��||j|j,��|_,|j'����|j*��||j|j,��|_,|����}|jj�sC�t1|j%t�j:j%j���s|j%����|�[��|jxjQdz c_Q||$dz|#z|!z z|j_F|j*��||j|j,��|_,|��||||||||�1��n+|j*��||j|j,��|_,|j,j�s |j,j�r%�t��r�t?j���n��'|j,j�s |j,j�r%�t��r�t?j���n���|$dkr7t �nd2|jjQ�d3|�d4���d|j,_�|j*��||j|j,��|_,|��||||||||�1��t6j�|jjvrQ�t��r(�t?j��tIj�����nt �nd5��|j,j�rn���|jar!tq|d6��r�tM|d6��t �Dd7��|j�r�|jj����t��r�t?j�d8��nM|j��tVj�kr�t[j���n"t?��r�t_j���|����|xjY|�t��z c_Yt|jjQd9��}5|jY|5z }6�tcd:|| |jjd|�;��}7|����|jj�|7d<<|6|7d=<d |_�|j���|7��|��|7��|��|��}8|��d |8�>��}9|jj�r�|jj��t|jj�dkrd|9D]a}:t�jI��|:|jj���s5t �Dd?|:�d@����t{j�|:d�A���b|j*��||j|j,��|_,|����|j��|��|j ���t�|jjQ|6|7��S)BNr)�release_memoryr Tz)Currently training with a batch size of: z�Currently --debug underflow_overflow is not supported under DP. Please use DDP (torchrun or torch.distributed.launch (deprecated)).� fsdp_version�F)rc�<�g|]}t|t���|��Sr�rrs r�r�z0Trainer._inner_training_loop.<locals>.<listcomp> rr�)r)�gradient_checkpointing_kwargs)� recursive�fp8rj��load_module_strictz***** Running training *****� Num examples = rXz Num Epochs = z( Instantaneous batch size per device = zA Training with DataParallel so batch size has been adjusted to: zE Total train batch size (w. parallel, distributed & accumulation) = z Gradient Accumulation steps = z Total optimization steps = z# Number of trainable parameters = )�trainable_onlyzE Continuing training from checkpoint, will skip to saved global_stepz! Continuing training from epoch z' Continuing training from global step z Will skip the first z epochs then the first z batches in the first epoch.)r�r[r\r(�r�)�skip_scheduler� set_epoch������main_input_namerz�Tried to track the number of tokens seen, however the current model is not configured properly to know what item is the input. To fix this, add a `main_input_name` attribute to the model class you are using.)r�r�r�z0Calculated loss must be on the original device: z but device in use is �item)r�zXThere seems not to be a single sample in your epoch_iterator, stopping training at step zI! This is expected if you're using an IterableDataset and set num_steps (z.) higher than the number of available samples.z�You enabled PyTorch/XLA debug metrics but you don't have a TPU configured. Check your training configuration if this is unexpected.�_pastzU Training completed. 
        logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n")
        if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
            # Wait for everyone to get here so we are sure the model has been saved by process 0.
            if is_torch_xla_available():
                xm.rendezvous("load_best_model_at_end")
            elif args.parallel_mode == ParallelMode.DISTRIBUTED:
                dist.barrier()
            elif is_sagemaker_mp_enabled():
                smp.barrier()

            self._load_best_model()

        # add remaining tr_loss
        self._total_loss_scalar += tr_loss.item()
        effective_global_step = max(self.state.global_step, 0.001)  # Avoid ZeroDivisionError
        train_loss = self._total_loss_scalar / effective_global_step

        metrics = speed_metrics(
            "train",
            start_time,
            num_samples=num_train_samples,
            num_steps=self.state.max_steps,
            num_tokens=num_train_tokens,
        )
        self.store_flos()
        metrics["total_flos"] = self.state.total_flos
        metrics["train_loss"] = train_loss

        self.is_in_train = False

        self._memory_tracker.stop_and_update_metrics(metrics)

        self.log(metrics)

        run_dir = self._get_output_dir(trial)
        checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir)

        # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint
        # and the process is allowed to save.
        if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1:
            for checkpoint in checkpoints_sorted:
                if not os.path.samefile(checkpoint, self.state.best_model_checkpoint):
                    logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
                    shutil.rmtree(checkpoint, ignore_errors=True)

        self.control = self.callback_handler.on_train_end(args, self.state, self.control)

        # Wait for the checkpoint to be uploaded.
        self._finish_current_push()

        return TrainOutput(self.state.global_step, train_loss, metrics)

    def _get_output_dir(self, trial):
        if self.hp_search_backend is not None and trial is not None:
            if self.hp_search_backend == HPSearchBackend.OPTUNA:
                run_id = trial.number
            elif self.hp_search_backend == HPSearchBackend.RAY:
                import ray.train

                run_id = ray.train.get_context().get_trial_id()
            elif self.hp_search_backend == HPSearchBackend.SIGOPT:
                run_id = trial.id
            elif self.hp_search_backend == HPSearchBackend.WANDB:
                import wandb

                run_id = wandb.run.id
            run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}"
            run_dir = os.path.join(self.args.output_dir, run_name)
        else:
            run_dir = self.args.output_dir
        return run_dir

    def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
        if model is None:
            model = self.model

        config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME)
        adapter_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME)
        adapter_safe_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)
        weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME)
        weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME)
        safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME)
        safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME)
        is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and (
            any(
                FSDP_MODEL_NAME in folder_name
                for folder_name in os.listdir(resume_from_checkpoint)
                if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
            )
            # this checks the FSDP state dict when `FULL_STATE_DICT` is used
            or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin"))
        )
        # if multiple adapters exist, they get saved in sub directories
        adapter_subdirs = (
            [
                folder_name
                for folder_name in os.listdir(resume_from_checkpoint)
                if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
                and (
                    os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_WEIGHTS_NAME))
                    or os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_SAFE_WEIGHTS_NAME))
                )
            ]
            if os.path.isdir(resume_from_checkpoint)
            else []
        )

        if is_fsdp_ckpt and not self.is_fsdp_enabled:
            raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP")

        if not (
            any(
                os.path.isfile(f)
                for f in [
                    weights_file,
                    safe_weights_file,
                    weights_index_file,
                    safe_weights_index_file,
                    adapter_weights_file,
                    adapter_safe_weights_file,
                ]
            )
            or is_fsdp_ckpt
            or adapter_subdirs
        ):
            raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}")

        logger.info(f"Loading model from {resume_from_checkpoint}.")

        if os.path.isfile(config_file):
            config = PretrainedConfig.from_json_file(config_file)
            checkpoint_version = config.transformers_version
            if checkpoint_version is not None and checkpoint_version != __version__:
                logger.warning(
                    f"You are resuming training from a checkpoint trained with {checkpoint_version} of "
                    f"Transformers but your current version is {__version__}. This is not recommended and could "
                    "yield to errors or unwanted behaviors."
                )

        if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt:
            if is_sagemaker_mp_enabled():
                if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")):
                    # If the 'user_content.pt' file exists, load with the new smp api.
                    smp.resume_from_checkpoint(
                        path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
                    )
                else:
                    # If the 'user_content.pt' file does not exist, load with the old smp api.
                    if hasattr(self.args, "fp16") and self.args.fp16 is True:
                        logger.warning(
                            "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported."
                        )
                    state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)
                    # Required for smp to not auto-translate state_dict from hf to smp (is already smp).
                    state_dict["_smp_is_partial"] = False
                    load_result = model.load_state_dict(state_dict, strict=True)
                    del state_dict
            elif self.is_fsdp_enabled:
                load_fsdp_model(
                    self.accelerator.state.fsdp_plugin,
                    self.accelerator,
                    model,
                    resume_from_checkpoint,
                    **_get_fsdp_ckpt_kwargs(),
                )
            else:
                # We load the model state dict on the CPU to avoid an OOM error.
                if self.args.save_safetensors and os.path.isfile(safe_weights_file):
                    state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu")
                else:
                    state_dict = torch.load(weights_file, map_location="cpu", weights_only=True)

                # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963
                # which takes *args instead of **kwargs
                load_result = model.load_state_dict(state_dict, False)
                del state_dict
                self._issue_warnings_after_load(load_result)
        elif _is_peft_model(model):
            # load adapters following PR # 24096
            if hasattr(model, "active_adapter") or hasattr(model, "active_adapters"):
                if hasattr(model, "active_adapters"):
                    active_adapters = model.active_adapters
                    if len(active_adapters) > 1:
                        logger.warning("Multiple active adapters detected will only consider the first adapter")
                    active_adapter = active_adapters[0]
                else:
                    active_adapter = model.active_adapter

                if hasattr(model, "load_adapter"):
                    if adapter_subdirs:
                        for subdir_name in adapter_subdirs:
                            peft_id = os.path.join(resume_from_checkpoint, subdir_name)
                            model.load_adapter(peft_id, subdir_name, is_trainable=(subdir_name == active_adapter))
                        model.set_adapter(active_adapter)
                    else:
                        model.load_adapter(resume_from_checkpoint, active_adapter, is_trainable=True)
                else:
                    logger.warning(
                        "The intermediate checkpoints of PEFT may not be saved correctly, "
                        f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding "
                        "saving folders. Check some examples here: https://github.com/huggingface/peft/issues/96"
                    )
            else:
                logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed")
        else:
            # We load the sharded checkpoint
            load_result = load_sharded_checkpoint(
                model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors
            )
            if not is_sagemaker_mp_enabled():
                self._issue_warnings_after_load(load_result)
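    # For reference, a checkpoint directory written by `_save_checkpoint` below
    # and consumed by `_load_from_checkpoint` above typically contains (the
    # exact file set depends on the backend in use):
    #
    #     checkpoint-500/
    #         config.json
    #         model.safetensors        # or pytorch_model.bin / a sharded index
    #         optimizer.pt             # OPTIMIZER_NAME
    #         scheduler.pt             # SCHEDULER_NAME
    #         scaler.pt                # SCALER_NAME, only with an active grad scaler
    #         rng_state.pth            # or rng_state_{rank}.pth per process
    #         trainer_state.json       # TRAINER_STATE_NAME
    #         training_args.bin        # TRAINING_ARGS_NAME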
    def _load_best_model(self):
        logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).")
        best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME)
        best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME)
        best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME)
        best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME)

        model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
        if self.is_deepspeed_enabled:
            deepspeed_load_checkpoint(
                self.model_wrapped,
                self.state.best_model_checkpoint,
                load_module_strict=not _is_peft_model(self.model),
            )
        elif self.is_fsdp_enabled:
            load_fsdp_model(
                self.accelerator.state.fsdp_plugin,
                self.accelerator,
                model,
                self.state.best_model_checkpoint,
                **_get_fsdp_ckpt_kwargs(),
            )
        elif (
            os.path.exists(best_model_path)
            or os.path.exists(best_safe_model_path)
            or os.path.exists(best_adapter_model_path)
            or os.path.exists(best_safe_adapter_model_path)
        ):
            has_been_loaded = True
            if is_sagemaker_mp_enabled():
                if os.path.isfile(os.path.join(self.state.best_model_checkpoint, "user_content.pt")):
                    # If the 'user_content.pt' file exists, load with the new smp api.
                    smp.resume_from_checkpoint(
                        path=self.state.best_model_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False
                    )
                else:
                    # If the 'user_content.pt' file does not exist, load with the old smp api.
                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                    else:
                        state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)
                    state_dict["_smp_is_partial"] = False
                    load_result = model.load_state_dict(state_dict, strict=True)
            else:
                if _is_peft_model(model) and (
                    hasattr(model, "active_adapter") or hasattr(model, "active_adapters")
                ):
                    if hasattr(model, "active_adapters"):
                        active_adapter = model.active_adapters[0]
                        if len(model.active_adapters) > 1:
                            logger.warning("Detected multiple active adapters, will only consider the first one")
                    else:
                        active_adapter = model.active_adapter

                    if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path):
                        try:
                            model.load_adapter(self.state.best_model_checkpoint, active_adapter)
                        except RuntimeError as exc:
                            if model.peft_config[active_adapter].is_prompt_learning:
                                # for context: https://github.com/huggingface/peft/issues/629
                                msg = (
                                    "When using prompt learning PEFT methods such as "
                                    f"{model.peft_config[active_adapter].peft_type.value}, setting "
                                    "load_best_model_at_end=True can lead to errors, it is recommended "
                                    "to set this to False and to load the model manually from the checkpoint "
                                    "directory using PeftModel.from_pretrained(base_model, <path>) after training "
                                    "has finished."
                                )
                                raise RuntimeError(msg) from exc
                            raise
                        from torch.nn.modules.module import _IncompatibleKeys

                        load_result = _IncompatibleKeys([], [])
                    else:
                        logger.warning(
                            "The intermediate checkpoints of PEFT may not be saved correctly, "
                            f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding "
                            "saving folders. Check some examples here: https://github.com/huggingface/peft/issues/96"
                        )
                        has_been_loaded = False
                else:
                    # We load the model state dict on the CPU to avoid an OOM error.
                    if self.args.save_safetensors and os.path.isfile(best_safe_model_path):
                        state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu")
                    else:
                        state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True)
                    load_result = model.load_state_dict(state_dict, False)
                if not is_sagemaker_mp_enabled() and has_been_loaded:
                    self._issue_warnings_after_load(load_result)
        elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists(
            os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME)
        ):
            load_result = load_sharded_checkpoint(
                model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled()
            )
            if not is_sagemaker_mp_enabled():
                self._issue_warnings_after_load(load_result)
        else:
            logger.warning(
                f"Could not locate the best model at {best_model_path}, if you are running a distributed training "
                "on multiple nodes, you should activate `--save_on_each_node`."
            )

    def _issue_warnings_after_load(self, load_result):
        if len(load_result.missing_keys) != 0:
            if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set(
                self.model._keys_to_ignore_on_save
            ):
                self.model.tie_weights()
            else:
                logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.")
        if len(load_result.unexpected_keys) != 0:
            logger.warning(
                f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}."
            )

    def _evaluate(self, trial, ignore_keys_for_eval, skip_scheduler=False):
        metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
        self._report_to_hp_search(trial, self.state.global_step, metrics)

        # Run delayed LR scheduler now that metrics are populated
        if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and not skip_scheduler:
            metric_to_check = self.args.metric_for_best_model
            if not metric_to_check.startswith("eval_"):
                metric_to_check = f"eval_{metric_to_check}"
            try:
                self.lr_scheduler.step(metrics[metric_to_check])
            except KeyError as exc:
                raise KeyError(
                    f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not "
                    f"found in the evaluation metrics. The available evaluation metrics are: {list(metrics.keys())}. "
                    f"Please ensure that the `compute_metrics` function returns a dictionary that includes "
                    f"'{metric_to_check}' or consider changing the `metric_for_best_model` via the TrainingArguments."
                ) from exc
        return metrics

    def _maybe_log_save_evaluate(self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time):
        if self.control.should_log and self.state.global_step > self._globalstep_last_logged:
            if is_torch_xla_available():
                xm.mark_step()

            logs: Dict[str, float] = {}

            # all_gather + mean() to get average loss over all processes
            tr_loss_scalar = self._nested_gather(tr_loss).mean().item()

            # reset tr_loss to zero
            tr_loss -= tr_loss

            logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
            if grad_norm is not None:
                logs["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm
            logs["learning_rate"] = self._get_learning_rate()

            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
            self.store_flos()

            self.log(logs, start_time)

        metrics = None
        if self.control.should_evaluate:
            metrics = self._evaluate(trial, ignore_keys_for_eval)
            is_new_best_metric = self._determine_best_metric(metrics, trial=trial)

            if self.args.save_strategy == SaveStrategy.BEST:
                self.control.should_save = is_new_best_metric

        if self.control.should_save:
            self._save_checkpoint(model, trial)
            self.control = self.callback_handler.on_save(self.args, self.state, self.control)

    def _load_rng_state(self, checkpoint):
        # Load RNG states from `checkpoint`
        if checkpoint is None:
            return

        if self.args.world_size > 1:
            process_index = self.args.process_index
            rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
            if not os.path.isfile(rng_file):
                logger.info(
                    f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
                    "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
                )
                return
        else:
            rng_file = os.path.join(checkpoint, "rng_state.pth")
            if not os.path.isfile(rng_file):
                logger.info(
                    "Didn't find an RNG file, if you are resuming a training that was launched in a "
                    "distributed fashion, reproducibility is not guaranteed."
                )
                return

        with safe_globals():
            checkpoint_rng_state = torch.load(rng_file, weights_only=True)
        random.setstate(checkpoint_rng_state["python"])
        np.random.set_state(checkpoint_rng_state["numpy"])
        torch.random.set_rng_state(checkpoint_rng_state["cpu"])
        if is_torch_xla_available():
            xm.set_rng_state(checkpoint_rng_state["xla"])
        is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED
        if torch.cuda.is_available():
            set_rng_state_for_device("CUDA", torch.cuda, checkpoint_rng_state, is_distributed)
        if is_torch_npu_available():
            set_rng_state_for_device("NPU", torch.npu, checkpoint_rng_state, is_distributed)
        if is_torch_hpu_available():
            set_rng_state_for_device("HPU", torch.hpu, checkpoint_rng_state, is_distributed)
        if is_torch_mlu_available():
            set_rng_state_for_device("MLU", torch.mlu, checkpoint_rng_state, is_distributed)
        if is_torch_musa_available():
            set_rng_state_for_device("MUSA", torch.musa, checkpoint_rng_state, is_distributed)
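    # Illustration of the metric naming handled by `_evaluate` above and
    # `_determine_best_metric` below: with `metric_for_best_model="accuracy"`,
    # the trainer looks up `"eval_accuracy"`, so a `compute_metrics` function
    # only needs to return the bare name (a sketch, not part of the original):
    #
    #     def compute_metrics(eval_pred):
    #         predictions, labels = eval_pred
    #         return {"accuracy": (predictions.argmax(-1) == labels).mean().item()}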
    def _determine_best_metric(self, metrics, trial):
        """
        Determine if the model should be saved based on the evaluation metrics.

        Returns:
            bool: True if a new best metric was found, else False
        """
        is_new_best_metric = False

        if self.args.metric_for_best_model is not None:
            metric_to_check = self.args.metric_for_best_model
            if not metric_to_check.startswith("eval_"):
                metric_to_check = f"eval_{metric_to_check}"

            try:
                metric_value = metrics[metric_to_check]
            except KeyError as exc:
                raise KeyError(
                    f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not "
                    f"found in the evaluation metrics. The available evaluation metrics are: {list(metrics.keys())}. "
                    "Consider changing the `metric_for_best_model` via the TrainingArguments."
                ) from exc

            operator = np.greater if self.args.greater_is_better else np.less

            if self.state.best_metric is None:
                self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf")

            if operator(metric_value, self.state.best_metric):
                self.state.best_metric = metric_value

                if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH]:
                    self.state.best_global_step = self.state.global_step

                is_new_best_metric = True

        return is_new_best_metric

    def _save_checkpoint(self, model, trial):
        # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
        # want to save except FullyShardedDDP.

        # Save model checkpoint
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

        if self.hp_search_backend is None and trial is None:
            self.store_flos()

        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)
        self.save_model(output_dir, _internal_call=True)

        if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH] and self.state.best_global_step:
            best_checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}"
            best_checkpoint_dir = os.path.join(run_dir, best_checkpoint_folder)

            if os.path.exists(best_checkpoint_dir):
                self.state.best_model_checkpoint = best_checkpoint_dir

        if not self.args.save_only_model:
            # Save optimizer and scheduler
            self._save_optimizer_and_scheduler(output_dir)
            self._save_scaler(output_dir)
            # Save RNG state
            self._save_rng_state(output_dir)

        # Save the Trainer state
        if self.args.should_save:
            # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently
            for cb in [
                cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState)
            ]:
                cb_name = cb.__class__.__name__
                cb_state = cb.state()
                if isinstance(self.state.stateful_callbacks[cb_name], list):
                    self.state.stateful_callbacks[cb_name].append(cb_state)
                else:
                    self.state.stateful_callbacks[cb_name] = cb_state
            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))

        if self.args.push_to_hub:
            self._push_from_checkpoint(output_dir)

        if self.args.should_save:
            # Maybe delete some older checkpoints.
            self._rotate_checkpoints(use_mtime=False, output_dir=run_dir)

    def _save_rng_state(self, output_dir):
        # Save RNG state in non-distributed training
        rng_states = {
            "python": random.getstate(),
            "numpy": np.random.get_state(),
            "cpu": torch.random.get_rng_state(),
        }
        if torch.cuda.is_available():
            if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
                # In distributed mode, save the per-device CUDA RNG states (also covers DataParallel)
                rng_states["cuda"] = torch.cuda.random.get_rng_state_all()
            else:
                rng_states["cuda"] = torch.cuda.random.get_rng_state()

        if is_torch_xla_available():
            rng_states["xla"] = xm.get_rng_state()

        for device_name, is_available_fn in (
            ("npu", is_torch_npu_available),
            ("hpu", is_torch_hpu_available),
            ("mlu", is_torch_mlu_available),
            ("musa", is_torch_musa_available),
        ):
            if is_available_fn():
                device_module = getattr(torch, device_name)
                if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
                    rng_states[device_name] = device_module.random.get_rng_state_all()
                else:
                    rng_states[device_name] = device_module.random.get_rng_state()

        # A process can arrive here before process 0 has a chance to save the model,
        # in which case output_dir may not yet exist.
        os.makedirs(output_dir, exist_ok=True)

        if self.args.world_size <= 1:
            torch.save(rng_states, os.path.join(output_dir, "rng_state.pth"))
        else:
            torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth"))

    def _save_optimizer_and_scheduler(self, output_dir):
        if is_torch_xla_available():
            xm.rendezvous("saving_optimizer_states")
            if self.is_fsdp_xla_v1_enabled:
                optm = {
                    "optimizer": self.optimizer.state_dict(),
                    "shard_metadata": self.model.get_shard_metadata(),
                }
                xm.save(
                    optm,
                    os.path.join(
                        output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
                    ),
                    master_only=False,
                )
            else:
                xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
            with warnings.catch_warnings(record=True) as caught_warnings:
                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
                reissue_pt_warnings(caught_warnings)
        elif is_sagemaker_mp_enabled():
            opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
            smp.barrier()
            if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state:
                smp.save(
                    opt_state_dict,
                    os.path.join(output_dir, OPTIMIZER_NAME),
                    partial=True,
                    v3=smp.state.cfg.shard_optimizer_state,
                )
        elif self.is_deepspeed_enabled:
            # under zero3 the model file itself doesn't get saved since it's bogus, unless the deepspeed
            # config `stage3_gather_16bit_weights_on_model_save` is True
            accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set(
                inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys()
            )
            if accept_exclude_frozen_parameters and _is_peft_model(self.model):
                self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True)
            else:
                self.model_wrapped.save_checkpoint(output_dir)
        elif self.is_fsdp_enabled:
            # save fsdp specific ckpt for resuming from ckpt
            save_fsdp_model(
                self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **_get_fsdp_ckpt_kwargs()
            )
            save_fsdp_optimizer(
                self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
            )
        elif self.args.should_save:
            # deepspeed.save_checkpoint above saves model/optim/sched
            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))

        # Save SCHEDULER
        is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance(
            self.lr_scheduler, DeepSpeedSchedulerWrapper
        )
        if (
            self.args.should_save
            and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler)
            and not is_torch_xla_available()
        ):
            with warnings.catch_warnings(record=True) as caught_warnings:
                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
            reissue_pt_warnings(caught_warnings)

    def _load_optimizer_and_scheduler(self, checkpoint):
        """If optimizer and scheduler states exist, load them."""
        if checkpoint is None:
            return

        if self.is_deepspeed_enabled:
            # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init
            if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper):
                with warnings.catch_warnings(record=True) as caught_warnings:
                    self.lr_scheduler.load_state_dict(
                        torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
                    )
                reissue_pt_warnings(caught_warnings)
            return

        checkpoint_file_exists = (
            glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*")
            if is_sagemaker_mp_enabled()
            else (
                os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME))
                or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN))
                or (
                    os.path.isdir(checkpoint)
                    and any(
                        OPTIMIZER_NAME_BIN.split(".")[0] in folder_name
                        for folder_name in os.listdir(checkpoint)
                        if os.path.isdir(os.path.join(checkpoint, folder_name))
                    )
                )
            )
        )
        if self.is_fsdp_xla_v1_enabled:
            checkpoint_file_exists = glob.glob(
                os.path.join(checkpoint, f"rank*-of-{self.args.world_size}-{OPTIMIZER_NAME}")
            )
        if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)):
            # Load in optimizer and scheduler states
            if is_torch_xla_available():
                # On TPU we have to take some extra precautions to properly load the states on the right device.
                if self.is_fsdp_xla_v1_enabled:
                    optimizer_state = torch.load(
                        os.path.join(
                            checkpoint, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}"
                        ),
                        map_location="cpu",
                        weights_only=True,
                    )
                    optimizer_state = optimizer_state["optimizer"]
                else:
                    optimizer_state = torch.load(
                        os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu", weights_only=True
                    )
                with warnings.catch_warnings(record=True) as caught_warnings:
                    lr_scheduler_state = torch.load(
                        os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu", weights_only=True
                    )
                reissue_pt_warnings(caught_warnings)

                xm.send_cpu_data_to_device(optimizer_state, self.args.device)
                xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device)
                self.optimizer.load_state_dict(optimizer_state)
                self.lr_scheduler.load_state_dict(lr_scheduler_state)
            elif is_sagemaker_mp_enabled():
                # smp >= 1.10: the optimizer is loaded after the model is stepped once
                def opt_load_hook(mod, opt):
                    opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True))

                self.model_wrapped.register_post_step_hook(opt_load_hook)
            else:
                map_location = self.args.device if self.args.world_size > 1 else "cpu"
                if self.is_fsdp_enabled:
                    load_fsdp_optimizer(
                        self.accelerator.state.fsdp_plugin,
                        self.accelerator,
                        self.optimizer,
                        self.model,
                        checkpoint,
                        **_get_fsdp_ckpt_kwargs(),
                    )
                else:
                    self.optimizer.load_state_dict(
                        torch.load(
                            os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location, weights_only=True
                        )
                    )
                with warnings.catch_warnings(record=True) as caught_warnings:
                    self.lr_scheduler.load_state_dict(
                        torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True)
                    )
                reissue_pt_warnings(caught_warnings)

    def _save_scaler(self, output_dir):
        try:
            scaler = self.accelerator.scaler
        except AttributeError:
            return

        if scaler is None:
            return

        if is_torch_xla_available():
            xm.rendezvous("saving_scaler_state")
            with warnings.catch_warnings(record=True) as caught_warnings:
                xm.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
                reissue_pt_warnings(caught_warnings)
        elif self.args.should_save and not is_sagemaker_mp_enabled():
            with warnings.catch_warnings(record=True) as caught_warnings:
                torch.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME))
            reissue_pt_warnings(caught_warnings)

    def _load_scaler(self, checkpoint):
        """If scaler state exists, load it."""
        if checkpoint is None:
            return
        scaler_state_exists = os.path.isfile(os.path.join(checkpoint, SCALER_NAME))
        if scaler_state_exists:
            if is_torch_xla_available():
                with warnings.catch_warnings(record=True) as caught_warnings:
                    scaler_state = torch.load(
                        os.path.join(checkpoint, SCALER_NAME), map_location="cpu", weights_only=True
                    )
                reissue_pt_warnings(caught_warnings)
                xm.send_cpu_data_to_device(scaler_state, self.args.device)
                self.accelerator.scaler.load_state_dict(scaler_state)
            else:
                with warnings.catch_warnings(record=True) as caught_warnings:
                    self.accelerator.scaler.load_state_dict(
                        torch.load(os.path.join(checkpoint, SCALER_NAME), weights_only=True)
                    )
                reissue_pt_warnings(caught_warnings)

    def _load_callback_state(self):
        """If callback states exist and were passed in, restore their states if enabled"""
        if not self.args.restore_callback_states_from_checkpoint:
            return
        # Callback states are stored in stateful_callbacks
        not_found = []
        new_callbacks = []
        original_callbacks = self.callback_handler.callbacks + [self.control]
        for stored_callback, data in self.state.stateful_callbacks.items():
            if not isinstance(data, list):
                data = [data]
            if any(callback.__class__.__name__ == stored_callback for callback in original_callbacks):
                # We can load/restore from multiple callbacks of the same type.
                duplicates = [
                    callback for callback in original_callbacks if callback.__class__.__name__ == stored_callback
                ]
                for callback, callback_data in zip(duplicates, data):
                    args = callback_data.get("args", {})
                    attributes = callback_data.get("attributes", {})
                    new_callback = type(callback)(**args)
                    for attribute, value in attributes.items():
                        setattr(new_callback, attribute, value)
                    if isinstance(callback, TrainerControl):
                        # Specifically for restoring the `control` state
                        self.control = new_callback
                    else:
                        new_callbacks.append(new_callback)
                    # We remove the existing callback and add it to the list of new callbacks
                    self.callback_handler.remove_callback(type(new_callback))
                logger.info("Continuing training from checkpoint, restoring any callbacks that were passed in")
            else:
                not_found.append(stored_callback)
        if len(not_found) > 0:
            logger.warning(
                f"Checkpoint included callbacks not included in current configuration. Ignoring. ({', '.join(not_found)})"
            )
        for callback in new_callbacks:
            self.callback_handler.add_callback(callback)

    def hyperparameter_search(
        self,
        hp_space: Optional[Callable[["optuna.Trial"], Dict[str, float]]] = None,
        compute_objective: Optional[Callable[[Dict[str, float]], float]] = None,
        n_trials: int = 20,
        direction: Union[str, List[str]] = "minimize",
        backend: Optional[Union["str", HPSearchBackend]] = None,
        hp_name: Optional[Callable[["optuna.Trial"], str]] = None,
        **kwargs,
    ) -> Union[BestRun, List[BestRun]]:
        """
        Launch an hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is determined
        by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided,
        the sum of all metrics otherwise.

        <Tip warning={true}>

        To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to
        reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to
        subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom
        optimizer/scheduler.

        </Tip>

        Args:
            hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*):
                A function that defines the hyperparameter search space. Will default to
                [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or
                [`~trainer_utils.default_hp_space_sigopt`] depending on your backend.
            compute_objective (`Callable[[Dict[str, float]], float]`, *optional*):
                A function computing the objective to minimize or maximize from the metrics returned by the `evaluate`
                method. Will default to [`~trainer_utils.default_compute_objective`].
            n_trials (`int`, *optional*, defaults to 20):
                The number of trial runs to test.
            direction (`str` or `List[str]`, *optional*, defaults to `"minimize"`):
                If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you
                should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
                several metrics. If it's multi objectives optimization, direction is `List[str]`, can be a list of
                `"minimize"` and `"maximize"`.
            backend (`str` or [`~training_utils.HPSearchBackend`], *optional*):
                The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending
                on which one is installed. If all are installed, will default to optuna.
            hp_name (`Callable[["optuna.Trial"], str]]`, *optional*):
                A function that defines the trial/run name. Will default to None.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments for each backend:

                - `optuna`: parameters from
                  [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
                  and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from
                  [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize)
                - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run).
                  If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if
                  available). If `progress_reporter` is not set in the `kwargs`,
                  [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used.
                - `sigopt`: the parameter `proxies` from
                  [sigopt.Connection.set_proxies](https://docs.sigopt.com/support/faq#how-do-i-use-sigopt-with-a-proxy).

        Returns:
            [`trainer_utils.BestRun` or `List[trainer_utils.BestRun]`]: All the information about the best run or best
            runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray
            backend.
        """
        if backend is None:
            backend = default_hp_search_backend()
        backend = HPSearchBackend(backend)
        backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]()
        backend_obj.ensure_available()
        self.hp_search_backend = backend
        if self.model_init is None:
            raise RuntimeError(
                "To use hyperparameter search, you need to pass your model through a model_init function."
            )

        self.hp_space = backend_obj.default_hp_space if hp_space is None else hp_space
        self.hp_name = hp_name
        self.compute_objective = default_compute_objective if compute_objective is None else compute_objective

        best_run = backend_obj.run(self, n_trials, direction, **kwargs)

        self.hp_search_backend = None
        return best_run
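    # A minimal usage sketch for `hyperparameter_search` (illustrative; it
    # requires the trainer to have been constructed with `model_init`):
    #
    #     best_run = trainer.hyperparameter_search(
    #         hp_space=optuna_hp_space,  # e.g. the sketch shown earlier
    #         compute_objective=lambda metrics: metrics["eval_loss"],
    #         n_trials=20,
    #         direction="minimize",
    #         backend="optuna",
    #     )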
    def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
        """
        Log `logs` on the various objects watching training.

        Subclass and override this method to inject custom behavior.

        Args:
            logs (`Dict[str, float]`):
                The values to log.
            start_time (`Optional[float]`):
                The start of training.
        """
        if self.state.epoch is not None:
            logs["epoch"] = self.state.epoch
        if self.args.include_num_input_tokens_seen:
            logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen
            if start_time is not None:
                speed_metrics("train", start_time, num_tokens=self.state.num_input_tokens_seen)

        output = {**logs, **{"step": self.state.global_step}}
        self.state.log_history.append(output)
        self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs)

    def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]:
        """
        Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
        """
        if isinstance(data, Mapping):
            return type(data)({k: self._prepare_input(v) for k, v in data.items()})
        elif isinstance(data, (tuple, list)):
            return type(data)(self._prepare_input(v) for v in data)
        elif isinstance(data, torch.Tensor):
            kwargs = {"device": self.args.device}
            if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)):
                # NLP model inputs are int/uint and those get adjusted to the right dtype of the
                # embedding. Other models such as wav2vec2's inputs are already float and thus
                # may need special handling to match the dtypes of the model
                kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()})
            return data.to(**kwargs)
        return data

    def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
        """
        Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
        handling potential state.
        """
        inputs = self._prepare_input(inputs)
        if len(inputs) == 0:
            raise ValueError(
                "The batch received was empty, your model won't be able to train on it. Double-check that your "
                f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}."
            )
        if self.args.past_index >= 0 and self._past is not None:
            inputs["mems"] = self._past

        return inputs

    def compute_loss_context_manager(self):
        """
        A helper wrapper to group together context managers.
        """
        return self.autocast_smart_context_manager()

    def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
        """
        A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
        arguments, depending on the situation.
        """
        if self.use_cpu_amp:
            ctx_manager = torch.cpu.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
        else:
            ctx_manager = contextlib.nullcontext()

        return ctx_manager
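    # For orientation, the CPU branch of `autocast_smart_context_manager` above
    # is roughly the plain PyTorch idiom below (a sketch; `model` and `batch`
    # are illustrative placeholders):
    #
    #     with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    #         outputs = model(**batch)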
    def training_step(
        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None
    ) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to train.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under
                the argument `labels`. Check your model's documentation for all accepted arguments.

        Return:
            `torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
            self.optimizer.train()

        inputs = self._prepare_inputs(inputs)
        if is_sagemaker_mp_enabled():
            loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)
            return loss_mb.reduce_mean().detach().to(self.args.device)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)

        del inputs
        if (
            self.args.torch_empty_cache_steps is not None
            and self.state.global_step % self.args.torch_empty_cache_steps == 0
        ):
            if is_torch_xpu_available():
                torch.xpu.empty_cache()
            elif is_torch_mlu_available():
                torch.mlu.empty_cache()
            elif is_torch_musa_available():
                torch.musa.empty_cache()
            elif is_torch_npu_available():
                torch.npu.empty_cache()
            elif is_torch_mps_available(min_version="2.0"):
                torch.mps.empty_cache()
            elif is_torch_hpu_available():
                logger.warning(
                    "`torch_empty_cache_steps` is set but HPU device/backend does not support empty_cache()."
                )
            else:
                torch.cuda.empty_cache()

        kwargs = {}

        if self.args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            # Finally we need to normalize the loss for reporting
            if not self.model_accepts_loss_kwargs and self.compute_loss_func is None:
                loss = loss / self.args.gradient_accumulation_steps

            # Turn off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
            if self.accelerator.distributed_type == DistributedType.DEEPSPEED:
                kwargs["scale_wrt_gas"] = False

            self.accelerator.backward(loss, **kwargs)

        return loss.detach()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs:
            labels = inputs.pop("labels")
        else:
            labels = None
        if self.model_accepts_loss_kwargs:
            loss_kwargs = {}
            if num_items_in_batch is not None:
                loss_kwargs["num_items_in_batch"] = num_items_in_batch
            inputs = {**inputs, **loss_kwargs}
        outputs = model(**inputs)
        # Save past state if it exists
        if self.args.past_index >= 0:
            self._past = outputs[self.args.past_index]

        if labels is not None:
            unwrapped_model = self.accelerator.unwrap_model(model)
            if _is_peft_model(unwrapped_model):
                model_name = unwrapped_model.base_model.model._get_name()
            else:
                model_name = unwrapped_model._get_name()
            # User-defined compute_loss function
            if self.compute_loss_func is not None:
                loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch)
            elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
                loss = self.label_smoother(outputs, labels, shift_labels=True)
            else:
                loss = self.label_smoother(outputs, labels)
        else:
            if isinstance(outputs, dict) and "loss" not in outputs:
                raise ValueError(
                    "The model did not return a loss from the inputs, only the following keys: "
                    f"{','.join(outputs.keys())}. For reference, the inputs it received are "
                    f"{','.join(inputs.keys())}."
                )
            # We don't use .loss here since the model may return tuples instead of ModelOutput.
            loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]

        if (
            self.args.average_tokens_across_devices
            and (self.model_accepts_loss_kwargs or self.compute_loss_func)
            and num_items_in_batch is not None
        ):
            loss *= self.accelerator.num_processes

        return (loss, outputs) if return_outputs else loss
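    # Worked example of the loss normalization in `training_step` above (an
    # illustration of the code path, with illustrative numbers): with
    # gradient_accumulation_steps=4 and a model that does not accept loss
    # kwargs, each of the 4 micro-batch losses is divided by 4 before
    # `backward()`, so the summed gradients approximate one batch of 4x the
    # size; when `num_items_in_batch` is available, the model normalizes over
    # the true token count inside its own loss instead.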
    def is_local_process_zero(self) -> bool:
        """
        Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
        machines) main process.
        """
        return self.args.local_process_index == 0

    def is_world_process_zero(self) -> bool:
        """
        Whether or not this process is the global main process (when training in a distributed fashion on several
        machines, this is only going to be `True` for one process).
        """
        # Special case for SageMaker ModelParallel since there process_index is dp_process_index,
        # not the global process index.
        if is_sagemaker_mp_enabled():
            return smp.rank() == 0
        else:
            return self.args.process_index == 0

    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
        """
        Will save the model, so you can reload it using `from_pretrained()`.

        Will only save from the main process.
        """
        if output_dir is None:
            output_dir = self.args.output_dir

        if is_torch_xla_available():
            self._save_tpu(output_dir)
        elif is_sagemaker_mp_enabled():
            # Calling the state_dict needs to be done on the wrapped model and on all processes.
            os.makedirs(output_dir, exist_ok=True)
            state_dict = self.model_wrapped.state_dict()
            if self.args.should_save:
                self._save(output_dir, state_dict=state_dict)
            if IS_SAGEMAKER_MP_POST_1_10:
                # 'user_content.pt' indicates model state_dict saved with smp >= 1.10
                Path(os.path.join(output_dir, "user_content.pt")).touch()
        elif self.is_fsdp_enabled:
            if "FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type) and version.parse(
                accelerate_version
            ) > version.parse("0.24.1"):
                state_dict = self.accelerator.get_state_dict(self.model)
                if self.args.should_save:
                    self._save(output_dir, state_dict=state_dict)
        elif self.is_deepspeed_enabled:
            try:
                state_dict = self.accelerator.get_state_dict(self.deepspeed)
                if self.args.should_save:
                    self._save(output_dir, state_dict=state_dict)
            except ValueError:
                logger.warning(
                    " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use"
                    " zero_to_fp32.py to recover weights"
                )
                if self.args.should_save:
                    self._save(output_dir, state_dict={})
                # remove the dummy state_dict
                remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
                self.model_wrapped.save_checkpoint(output_dir)
        elif self.args.should_save:
            self._save(output_dir)

        # Push to the Hub when `save_model` is called by the user.
        if self.args.push_to_hub and not _internal_call:
            self.push_to_hub(commit_message="Model save")

    def _save_tpu(self, output_dir: Optional[str] = None):
        output_dir = output_dir if output_dir is not None else self.args.output_dir

        logger.info(f"Saving model checkpoint to {output_dir}")
        model = self.model
        xm.mark_step()

        if xm.is_master_ordinal(local=False):
            os.makedirs(output_dir, exist_ok=True)
            torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        supported_classes = (PushToHubMixin,)
        xm.rendezvous("saving_checkpoint")
        if self.is_fsdp_xla_v1_enabled:
            ckpt = {
                "model": model.state_dict(),
                "shard_metadata": model.get_shard_metadata(),
            }
            ckpt_path = os.path.join(
                output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{WEIGHTS_NAME}"
            )
            xm.save(ckpt, ckpt_path, master_only=False)
            xm.rendezvous("save_full_checkpoints")
            if self.args.should_save:
                from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints

                full_state_dict, _ = consolidate_sharded_model_checkpoints(
                    ckpt_prefix=os.path.join(output_dir, ""),
                    ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}",
                    save_model=False,
                )
                model = self.accelerator.unwrap_model(model)
                if isinstance(model, supported_classes):
                    model.save_pretrained(
                        output_dir,
                        state_dict=full_state_dict,
                        save_function=xm.save,
                        safe_serialization=self.args.save_safetensors,
                    )
                else:
                    logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                    xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME))
        elif not isinstance(model, supported_classes):
            if isinstance(self.accelerator.unwrap_model(model), supported_classes):
                self.accelerator.unwrap_model(model).save_pretrained(
                    output_dir,
                    is_main_process=self.args.should_save,
                    state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
                    save_function=xm.save,
                    safe_serialization=self.args.save_safetensors,
                )
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                state_dict = xm._maybe_convert_to_cpu(model.state_dict())
                xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
        else:
            model.save_pretrained(
                output_dir,
                is_main_process=self.args.should_save,
                save_function=xm.save,
                state_dict=xm._maybe_convert_to_cpu(model.state_dict()),
                safe_serialization=self.args.save_safetensors,
            )
        if self.processing_class is not None and self.args.should_save:
            self.processing_class.save_pretrained(output_dir)
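    # A minimal usage sketch for `save_model` (illustrative; "my-model" is a
    # hypothetical path and the auto class depends on your task):
    #
    #     trainer.save_model("my-model")
    #     reloaded = AutoModelForSequenceClassification.from_pretrained("my-model")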
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Saving model checkpoint to {output_dir}")

        supported_classes = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel)
        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        if not isinstance(self.model, supported_classes):
            if state_dict is None:
                state_dict = self.model.state_dict()

            if isinstance(self.accelerator.unwrap_model(self.model), supported_classes):
                self.accelerator.unwrap_model(self.model).save_pretrained(
                    output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
                )
            else:
                logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
                if self.args.save_safetensors:
                    safetensors.torch.save_file(
                        state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"}
                    )
                else:
                    torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
        else:
            self.model.save_pretrained(
                output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
            )

        if self.processing_class is not None:
            self.processing_class.save_pretrained(output_dir)
        elif (
            self.data_collator is not None
            and hasattr(self.data_collator, "tokenizer")
            and self.data_collator.tokenizer is not None
        ):
            logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`")
            self.data_collator.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

    def store_flos(self):
        # Storing the number of floating-point operations that went into the model
        if self.args.parallel_mode == ParallelMode.DISTRIBUTED:
            self.state.total_flos += (
                distributed_broadcast_scalars([self.current_flos], device=self.args.device).sum().item()
            )
            self.current_flos = 0
        else:
            self.state.total_flos += self.current_flos
            self.current_flos = 0

    def _sorted_checkpoints(
        self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False
    ) -> List[str]:
        ordering_and_checkpoint_path = []

        glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]

        for path in glob_checkpoints:
            if use_mtime:
                ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
            else:
                regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
                if regex_match is not None and regex_match.groups() is not None:
                    ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

        checkpoints_sorted = sorted(ordering_and_checkpoint_path)
        checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
        # Make sure we don't delete the best model.
        if (
            self.state.best_model_checkpoint is not None
            and str(Path(self.state.best_model_checkpoint)) in checkpoints_sorted
        ):
            best_model_index = checkpoints_sorted.index(str(Path(self.state.best_model_checkpoint)))
            for i in range(best_model_index, len(checkpoints_sorted) - 2):
                checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i]
        return checkpoints_sorted

    def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None:
        if self.args.save_total_limit is None or self.args.save_total_limit <= 0:
            return

        # Check if we should delete older checkpoint(s)
        checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir)
        if len(checkpoints_sorted) <= self.args.save_total_limit:
            return

        # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last
        # checkpoint, which we don't do to allow resuming.
        save_total_limit = self.args.save_total_limit
        if (
            self.state.best_model_checkpoint is not None
            and self.args.save_total_limit == 1
            and checkpoints_sorted[-1] != self.state.best_model_checkpoint
        ):
            save_total_limit = 2

        number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
        checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
        for checkpoint in checkpoints_to_be_deleted:
            logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
            shutil.rmtree(checkpoint, ignore_errors=True)
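    # Worked example of the rotation above: with save_total_limit=1,
    # load_best_model_at_end=True, best checkpoint `checkpoint-500` and newest
    # checkpoint `checkpoint-900`, the effective limit is bumped to 2 so both
    # the best and the most recent checkpoint survive; anything older is
    # deleted.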
    def evaluate(
        self,
        eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:
        """
        Run evaluation and returns metrics.

        The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
        (pass it to the init `compute_metrics` argument).

        You can also subclass and override this method to inject custom behavior.

        Args:
            eval_dataset (Union[`Dataset`, Dict[str, `Dataset`]), *optional*):
                Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns
                not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will
                evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the
                `__len__` method.

                <Tip>

                If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run
                separate evaluations on each dataset. This can be useful to monitor how training affects other
                datasets or simply to get a more fine-grained evaluation.
                When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one of
                the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets
                `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the loss on
                `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`.

                </Tip>

            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "eval_bleu" if the prefix is "eval" (default)

        Returns:
            A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The
            dictionary also contains the epoch number which comes from the training state.
        """
        # handle multiple eval datasets
        override = eval_dataset is not None
        eval_dataset = eval_dataset if override else self.eval_dataset
        if isinstance(eval_dataset, dict):
            metrics = {}
            for eval_dataset_name, _eval_dataset in eval_dataset.items():
                dataset_metrics = self.evaluate(
                    eval_dataset=_eval_dataset if override else eval_dataset_name,
                    ignore_keys=ignore_keys,
                    metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}",
                )
                metrics.update(dataset_metrics)
            return metrics

        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        if self.is_fsdp_xla_v2_enabled:
            eval_dataloader = tpu_spmd_dataloader(eval_dataloader)

        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        if f"{metric_key_prefix}_model_preparation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.log(output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics)

        self._memory_tracker.stop_and_update_metrics(output.metrics)

        return output.metrics

    def predict(
        self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test"
    ) -> PredictionOutput:
        """
        Run prediction and returns predictions and potential metrics.

        Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
        will also return metrics, like in `evaluate()`.

        Args:
            test_dataset (`Dataset`):
                Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the
                `model.forward()` method are automatically removed. Has to implement the method `__len__`
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"test"`):
                An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                "test_bleu" if the prefix is "test" (default)

        <Tip>

        If your predictions or labels have different sequence length (for instance because you're doing dynamic padding
        in a token classification task) the predictions will be padded (on the right) to allow for concatenation into
        one array. The padding index is -100.

        </Tip>

        Returns: *NamedTuple* A namedtuple with the following keys:

            - predictions (`np.ndarray`): The predictions on `test_dataset`.
            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained
              labels).
        """
        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        test_dataloader = self.get_test_dataloader(test_dataset)
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        output = eval_loop(
            test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
        )
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        if f"{metric_key_prefix}_model_preparation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
        self._memory_tracker.stop_and_update_metrics(output.metrics)

        return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
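    # A minimal usage sketch for `predict` (illustrative, assuming a prepared
    # `test_dataset`):
    #
    #     outputs = trainer.predict(test_dataset)
    #     preds = outputs.predictions.argmax(-1)
    #     print(outputs.metrics)  # e.g. test_loss, test_runtime, ...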
    def _nested_gather(self, tensors, name=None):
        """
        Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before
        concatenating them to `gathered`
        """
        if tensors is None:
            return
        if is_torch_xla_available():
            if name is None:
                name = "nested_gather"
            tensors = nested_xla_mesh_reduce(tensors, name)
        elif is_sagemaker_mp_enabled():
            tensors = smp_gather(tensors)
        elif (self.args.distributed_state is not None and self.args.distributed_state.distributed_type != "NO") or (
            self.args.distributed_state is None and self.args.local_rank != -1
        ):
            tensors = distributed_concat(tensors)
        return tensors
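    # Hedged sketch (illustrative; the helper name is hypothetical): the
    # key-filtering `prediction_step` applies to dict-shaped model outputs --
    # everything listed in `ignore_keys`, plus the "loss" entry, is dropped before
    # the remaining tensors are treated as logits.
    @staticmethod
    def _sketch_filter_outputs(outputs: dict, ignore_keys: list) -> tuple:
        return tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"])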
    def prediction_step(
        self,
        model: nn.Module,
        inputs: dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[list[str]] = None,
    ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on `model` using `inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (`nn.Module`):
                The model to evaluate.
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under
                the argument `labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (`bool`):
                Whether or not to return the loss only.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.

        Return:
            Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss,
            logits and labels (each being optional).
        """
        # Body recoverable only in outline: determine whether `inputs` contain
        # labels, run the model under `compute_loss_context_manager` (or SageMaker
        # model parallelism's `smp_forward_only`), drop the `ignore_keys` and "loss"
        # entries from dict outputs, and return the detached (loss, logits, labels)
        # triple -- or (loss, None, None) when `prediction_loss_only` is set.
        ...

    def floating_point_ops(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> int:
        """
        For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point
        operations for every backward + forward pass. If using another model, either implement such a method in the
        model or subclass and override this method.

        Args:
            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

        Returns:
            `int`: The number of floating-point operations.
        """
        if hasattr(self.model, "floating_point_ops"):
            return self.model.floating_point_ops(inputs)
        return 0

    def init_hf_repo(self, token: Optional[str] = None):
        """
        Initializes a git repo in `self.args.hub_model_id`.
        """
        # Only on process zero
        if not self.is_world_process_zero():
            return

        if self.args.hub_model_id is None:
            repo_name = Path(self.args.output_dir).absolute().name
        else:
            repo_name = self.args.hub_model_id

        token = token if token is not None else self.args.hub_token
        repo_url = create_repo(repo_name, token=token, private=self.args.hub_private_repo, exist_ok=True)
        self.hub_model_id = repo_url.repo_id
        self.push_in_progress = None
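    # Hedged sketch (assumption: the helper and its constant-factor arithmetic are
    # illustrative): `PreTrainedModel.floating_point_ops` is built on the standard
    # 6 * parameters * tokens estimate for one forward + backward pass.
    @staticmethod
    def _sketch_estimate_flops(num_parameters: int, num_tokens: int) -> int:
        # ~2 FLOPs per parameter per token forward, ~4 backward
        return 6 * num_parameters * num_tokens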
    def create_model_card(
        self,
        language: Optional[str] = None,
        license: Optional[str] = None,
        tags: Union[str, list[str], None] = None,
        model_name: Optional[str] = None,
        finetuned_from: Optional[str] = None,
        tasks: Union[str, list[str], None] = None,
        dataset_tags: Union[str, list[str], None] = None,
        dataset: Union[str, list[str], None] = None,
        dataset_args: Union[str, list[str], None] = None,
    ):
        """
        Creates a draft of a model card using the information available to the `Trainer`.

        Args:
            language (`str`, *optional*):
                The language of the model (if applicable)
            license (`str`, *optional*):
                The license of the model. Will default to the license of the pretrained model used, if the original
                model given to the `Trainer` comes from a repo on the Hub.
            tags (`str` or `List[str]`, *optional*):
                Some tags to be included in the metadata of the model card.
            model_name (`str`, *optional*):
                The name of the model.
            finetuned_from (`str`, *optional*):
                The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo
                of the original model given to the `Trainer` (if it comes from the Hub).
            tasks (`str` or `List[str]`, *optional*):
                One or several task identifiers, to be included in the metadata of the model card.
            dataset_tags (`str` or `List[str]`, *optional*):
                One or several dataset tags, to be included in the metadata of the model card.
            dataset (`str` or `List[str]`, *optional*):
                One or several dataset identifiers, to be included in the metadata of the model card.
            dataset_args (`str` or `List[str]`, *optional*):
                One or several dataset arguments, to be included in the metadata of the model card.
        """
        if not self.is_world_process_zero():
            return

        model_card_filepath = os.path.join(self.args.output_dir, "README.md")
        is_peft_library = False
        if os.path.exists(model_card_filepath):
            library_name = ModelCard.load(model_card_filepath).data.get("library_name")
            is_peft_library = library_name == "peft"

            # Append existing tags in `tags`
            existing_tags = ModelCard.load(model_card_filepath).data.tags
            if tags is not None and existing_tags is not None:
                if isinstance(tags, str):
                    tags = [tags]
                for tag in existing_tags:
                    if tag not in tags:
                        tags.append(tag)

        training_summary = TrainingSummary.from_trainer(
            self,
            language=language,
            license=license,
            tags=tags,
            model_name=model_name,
            finetuned_from=finetuned_from,
            tasks=tasks,
            dataset_tags=dataset_tags,
            dataset=dataset,
            dataset_args=dataset_args,
        )
        model_card = training_summary.to_model_card()
        with open(model_card_filepath, "w") as f:
            f.write(model_card)

        if is_peft_library:
            self.accelerator.unwrap_model(self.model).create_or_update_model_card(self.args.output_dir)

    def _push_from_checkpoint(self, checkpoint_folder):
        # Only push from one node, and only if the strategy asks for checkpoint pushes.
        if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
            return
        # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True.
        if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done():
            return

        output_dir = self.args.output_dir
        # To avoid a new synchronization of all model weights, we just copy the files from the checkpoint folder.
        modeling_files = [CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME]
        # Add sharded checkpoints if we have an index.
        for index_file in [WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]:
            index_path = os.path.join(checkpoint_folder, index_file)
            if os.path.isfile(index_path):
                modeling_files.append(index_file)
                with open(index_path) as f:
                    index = json.loads(f.read())
                shard_files = list(set(index["weight_map"].values()))
                modeling_files.extend(shard_files)
        if is_peft_available():
            modeling_files.extend([ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME])
        for modeling_file in modeling_files:
            if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)):
                shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file))

        if self.processing_class is not None:
            self.processing_class.save_pretrained(output_dir)
        # Same for the training arguments.
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))

        if self.args.save_strategy == SaveStrategy.STEPS:
            commit_message = f"Training in progress, step {self.state.global_step}"
        else:
            commit_message = f"Training in progress, epoch {int(self.state.epoch)}"

        model_push_job = upload_folder(
            repo_id=self.hub_model_id,
            folder_path=output_dir,
            commit_message=commit_message,
            token=self.args.hub_token,
            run_as_future=True,
            ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
        )

        push_jobs = [model_push_job]

        if self.args.hub_strategy in [HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS]:
            path_in_repo = (
                "last-checkpoint" if self.args.hub_strategy == HubStrategy.CHECKPOINT else Path(checkpoint_folder).name
            )
            checkpoint_push = upload_folder(
                repo_id=self.hub_model_id,
                folder_path=checkpoint_folder,
                path_in_repo=path_in_repo,
                commit_message=commit_message + ", checkpoint",
                token=self.args.hub_token,
                run_as_future=True,
            )
            push_jobs.append(checkpoint_push)

        if self.push_in_progress is None or self.push_in_progress.is_done():
            self.push_in_progress = PushInProgress(push_jobs)
        else:
            self.push_in_progress.jobs.extend(push_jobs)

    def _finish_current_push(self):
        if not hasattr(self, "push_in_progress"):
            return
        if self.push_in_progress is not None and not self.push_in_progress.is_done():
            logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.")
            self.push_in_progress.wait_until_done()
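    # Hedged sketch (illustrative; the helper name is hypothetical): how
    # `_push_from_checkpoint` expands a sharded-weights index into the shard files
    # to copy -- the index JSON maps each parameter to its shard under "weight_map".
    @staticmethod
    def _sketch_shards_from_index(index_path: str) -> list[str]:
        with open(index_path) as f:
            index = json.load(f)
        return sorted(set(index["weight_map"].values()))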
    def push_to_hub(
        self,
        commit_message: Optional[str] = "End of training",
        blocking: bool = True,
        token: Optional[str] = None,
        revision: Optional[str] = None,
        **kwargs,
    ) -> str:
        """
        Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`.

        Parameters:
            commit_message (`str`, *optional*, defaults to `"End of training"`):
                Message to commit while pushing.
            blocking (`bool`, *optional*, defaults to `True`):
                Whether the function should return only when the `git push` has finished.
            token (`str`, *optional*, defaults to `None`):
                Token with write permission to overwrite Trainer's original args.
            revision (`str`, *optional*):
                The git revision to commit from. Defaults to the head of the "main" branch.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to [`~Trainer.create_model_card`].

        Returns:
            The URL of the repository where the model was pushed if `blocking=True`, or a `Future` object tracking
            the progress of the commit if `blocking=False`.
        """
        model_name = kwargs.pop("model_name", None)
        if model_name is None and self.args.should_save:
            if self.args.hub_model_id is None:
                model_name = Path(self.args.output_dir).name
            else:
                model_name = self.args.hub_model_id.split("/")[-1]
        token = token if token is not None else self.args.hub_token

        # In case the user calls this method with args.push_to_hub = False
        if self.hub_model_id is None:
            self.init_hf_repo(token=token)

        # Needs to be executed on all processes for TPU training, but will only save on the process determined by
        # self.args.should_save.
        self.save_model(_internal_call=True)

        # Only push from one node.
        if not self.is_world_process_zero():
            return

        # Merge the model's own tags into the `tags` kwarg so the model card picks
        # them up, since Trainer does not call `model.push_to_hub` itself.
        if getattr(self.model, "model_tags", None) is not None:
            if "tags" not in kwargs:
                kwargs["tags"] = []
            # If it is a string, convert it to a list
            if isinstance(kwargs["tags"], str):
                kwargs["tags"] = [kwargs["tags"]]
            for model_tag in self.model.model_tags:
                if model_tag not in kwargs["tags"]:
                    kwargs["tags"].append(model_tag)

        self.create_model_card(model_name=model_name, **kwargs)

        # Wait for the current upload to be finished.
        self._finish_current_push()
        return upload_folder(
            repo_id=self.hub_model_id,
            folder_path=self.args.output_dir,
            commit_message=commit_message,
            token=token,
            run_as_future=not blocking,
            ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"],
            revision=revision,
        )
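    # Hedged sketch (illustrative; the helper name is hypothetical): with
    # `blocking=False` the `upload_folder` call above runs as a
    # `concurrent.futures.Future`, so a caller can keep training and settle the
    # upload later; `Future.result()` blocks until the commit lands (or raises).
    @staticmethod
    def _sketch_wait_for_upload(push_result):
        if hasattr(push_result, "result"):
            return push_result.result()
        return push_result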
    def prediction_loop(
        self,
        dataloader: DataLoader,
        description: str,
        prediction_loss_only: Optional[bool] = None,
        ignore_keys: Optional[list[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> EvalLoopOutput:
        """
        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.

        Works both with or without labels.
        """
        # Legacy loop, used when `args.use_legacy_prediction_loop` is set. The
        # recovered outline: it requires a sized dataloader ("dataloader must
        # implement a working __len__") and a resolvable batch size ("Batch size
        # cannot be None. Ensure the dataloader has a valid batch_size or
        # total_batch_size."), optionally wraps the model for DeepSpeed/FP16/BF16
        # evaluation, accumulates losses/predictions/labels/inputs in
        # `DistributedTensorGatherer`s (flushing to CPU every
        # `eval_accumulation_steps`), then finalizes the gatherers, runs
        # `compute_metrics`, prefixes the results with `metric_key_prefix`, and
        # returns an `EvalLoopOutput`.
        ...

    def _gather_and_numpify(self, tensors, name):
        """
        Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before
        concatenating them to `gathered`
        """
        if tensors is None:
            return
        if is_torch_xla_available():
            tensors = nested_xla_mesh_reduce(tensors, name)
        elif is_sagemaker_mp_enabled():
            tensors = smp_gather(tensors)
        elif self.args.parallel_mode == ParallelMode.DISTRIBUTED:
            tensors = distributed_concat(tensors)

        return nested_numpify(tensors)
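    # Hedged sketch (illustrative; the helper name is hypothetical): the per-tensor
    # conversion `nested_numpify` performs on the gathered structures -- move to
    # CPU and, since numpy has no bfloat16, upcast that dtype to float32 first.
    @staticmethod
    def _sketch_tensor_to_numpy(t: torch.Tensor) -> "np.ndarray":
        t = t.detach().cpu()
        if t.dtype == torch.bfloat16:
            t = t.to(torch.float32)
        return t.numpy()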
    def _add_sm_patterns_to_gitignore(self) -> None:
        """Add SageMaker Checkpointing patterns to .gitignore file."""
        # Make sure we only do this on the main process
        if not self.is_world_process_zero():
            return

        patterns = ["*.sagemaker-uploading", "*.sagemaker-uploaded"]

        # Get current .gitignore content
        if os.path.exists(os.path.join(self.repo.local_dir, ".gitignore")):
            with open(os.path.join(self.repo.local_dir, ".gitignore"), "r") as f:
                current_content = f.read()
        else:
            current_content = ""

        # Add the patterns to .gitignore
        content = current_content
        for pattern in patterns:
            if pattern not in content:
                if content.endswith("\n"):
                    content += pattern
                else:
                    content += f"\n{pattern}"

        # Write the .gitignore file if it has changed
        if content != current_content:
            with open(os.path.join(self.repo.local_dir, ".gitignore"), "w") as f:
                logger.debug(f"Writing .gitignore file. Content: {content}")
                f.write(content)

        self.repo.git_add(".gitignore")

        # avoid race condition with git status
        time.sleep(0.5)

        if not self.repo.is_repo_clean():
            self.repo.git_commit("Add *.sagemaker patterns to .gitignore.")
            self.repo.git_push()

    def create_accelerator_and_postprocess(self):
        # Only the outline of this method is recoverable from the dump: it builds
        # the `Accelerator` from `args.accelerator_config` (including the
        # `DataLoaderConfiguration` for split/dispatch/even batches and the
        # seedable sampler) and then post-processes its DeepSpeed/FSDP plugins.
        # The recoverable consistency checks are:
        #   * an `AcceleratorConfig` whose `num_steps` is set conflicts with a
        #     `gradient_accumulation_steps` greater than 1 in the passed
        #     `TrainingArguments`;
        #   * `non_blocking` dataloading requires accelerate v0.30.0 and above, and
        #     works best with `dataloader_pin_memory` enabled;
        #   * tensor parallelism (`tp_size` > 1) requires accelerate > 1.3.0
        #     ("Requires accelerate>1.3.0 to use Tensor Parallelism.");
        #   * the activation_checkpointing in FSDP config and the
        #     gradient_checkpointing in training args can't both be True;
        #   * DeepSpeed and FSDP's 'SHARDED_STATE_DICT' can't be used with
        #     `save_only_model` along with `load_best_model_at_end`;
        #   * `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3;
        #     Zero-2, Zero-1, or FSDP should be used instead.
        ...
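    # Hedged sketch (illustrative; the helper name and the simplified message are
    # assumptions): the first consistency check outlined above, in isolation --
    # gradient accumulation must be configured in only one place, either the
    # `AcceleratorConfig` or the `TrainingArguments`.
    @staticmethod
    def _sketch_resolve_grad_accum(config_num_steps: Optional[int], args_grad_accum: int) -> int:
        if config_num_steps is not None and config_num_steps > 1 and args_grad_accum > 1:
            raise ValueError("Set gradient accumulation steps in only one place.")
        return config_num_steps if config_num_steps is not None else args_grad_accum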
    def propagate_args_to_deepspeed(self, auto_find_batch_size=False):
        """
        Sets values in the deepspeed plugin based on the Trainer args
        """
        from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

        ds_plugin = self.accelerator.state.deepspeed_plugin

        ds_plugin.hf_ds_config = HfTrainerDeepSpeedConfig(ds_plugin.hf_ds_config.config)
        ds_plugin.deepspeed_config = ds_plugin.hf_ds_config.config
        ds_plugin.hf_ds_config.trainer_config_process(self.args, auto_find_batch_size)

    def _fsdp_qlora_plugin_updates(self):
        if self.is_fsdp_enabled and _is_peft_model(self.model):
            from peft import PeftConfig
            from peft.utils.other import fsdp_auto_wrap_policy

            if isinstance(self.model.active_peft_config, PeftConfig):
                fsdp_plugin = self.accelerator.state.fsdp_plugin
                fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(self.model)
            if (
                getattr(self.model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES
                and self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage.is_floating_point
                and version.parse(accelerate_version) > version.parse("0.27.0")
            ):
                self.accelerator.state.fsdp_plugin.set_mixed_precision(
                    self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage, override=True
                )

    def get_batch_samples(self, epoch_iterator, num_batches):
        batch_samples = []
        num_items_in_batch = None
        for _ in range(num_batches):
            try:
                batch_samples.append(next(epoch_iterator))
            except StopIteration:
                break

        count_num_items_in_batch = (
            len(batch_samples) > 0
            and "labels" in batch_samples[0]
            and (self.model_accepts_loss_kwargs or self.compute_loss_func is not None)
        )

        if count_num_items_in_batch:
            try:
                num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples])
            except (TypeError, AttributeError):
                pass

        if num_items_in_batch is not None:
            if self.args.average_tokens_across_devices:
                num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum()
            if torch.is_tensor(num_items_in_batch):
                # Keep the count on-device; with DataParallel (n_gpu > 1) a 0-dim
                # tensor is unsqueezed so it can be scattered across replicas.
                num_items_in_batch = num_items_in_batch.to(self.args.device)
                if self.args.n_gpu > 1 and num_items_in_batch.dim() == 0:
                    num_items_in_batch = num_items_in_batch.unsqueeze(0)

        return batch_samples, num_items_in_batch
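    # Hedged sketch (illustrative; the helper name is hypothetical): the per-batch
    # token count `get_batch_samples` accumulates -- label positions equal to -100
    # are the ignore index and are excluded from `num_items_in_batch`.
    @staticmethod
    def _sketch_count_label_tokens(labels: torch.Tensor) -> "torch.Tensor":
        return labels.ne(-100).sum()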
    def set_initial_training_values(
        self, args: TrainingArguments, dataloader: DataLoader, total_train_batch_size: int
    ):
        """
        Calculates and returns the following values:
        - `num_train_epochs`
        - `num_update_steps_per_epoch`
        - `num_examples`
        - `num_train_samples`
        - `epoch_based`
        - `len_dataloader`
        - `max_steps`
        """
        # Case 1: we rely on `args.max_steps` first
        max_steps = args.max_steps
        # If max_steps is negative, we use the number of epochs to determine the number of total steps later
        epoch_based = max_steps < 0
        len_dataloader = len(dataloader) if has_length(dataloader) else None

        # Case 2: We have a dataloader length and can extrapolate
        if len_dataloader is not None:
            num_update_steps_per_epoch = max(len_dataloader // args.gradient_accumulation_steps, 1)
            # Case 3: We have a length but are using epochs, we can extrapolate the number of steps
            if epoch_based:
                max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch)

        # Now we figure out `num_examples`, `num_train_epochs`, and `train_samples`
        if len_dataloader:
            num_examples = self.num_examples(dataloader)
            if args.max_steps > 0:
                num_train_epochs = max_steps // num_update_steps_per_epoch + int(
                    max_steps % num_update_steps_per_epoch > 0
                )
                # May be slightly incorrect if the last batch in the training dataloader has a smaller size, but it's
                # the best we can do.
                num_train_samples = max_steps * total_train_batch_size
            else:
                num_train_epochs = math.ceil(args.num_train_epochs)
                num_train_samples = self.num_examples(dataloader) * args.num_train_epochs
        elif args.max_steps > 0:  # Rely on max_steps when dataloader does not have a working size
            num_train_epochs = sys.maxsize
            num_update_steps_per_epoch = max_steps
            num_examples = total_train_batch_size * args.max_steps
            num_train_samples = args.max_steps * total_train_batch_size
        else:
            raise ValueError(
                "args.max_steps must be set to a positive value if dataloader does not have a length, was"
                f" {args.max_steps}"
            )
        return (
            num_train_epochs,
            num_update_steps_per_epoch,
            num_examples,
            num_train_samples,
            epoch_based,
            len_dataloader,
            max_steps,
        )
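# Hedged sketch (illustrative; this module-level helper is not part of the
# recovered source): the epoch-based arithmetic `set_initial_training_values`
# uses when `max_steps` is unset -- floor-divide the dataloader length by the
# accumulation steps (at least one update per epoch), then round the epoch count
# up to whole optimizer steps.
def _sketch_max_steps(num_batches: int, gradient_accumulation_steps: int, num_train_epochs: float) -> int:
    num_update_steps_per_epoch = max(num_batches // gradient_accumulation_steps, 1)
    return math.ceil(num_train_epochs * num_update_steps_per_epoch)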