� <��g�4��ddlZddlZddlZddlmZddlmZmZmZm Z ddl Zddlm Z eGd�d��Z ddejd eed edeefd�Z ddejd eed ede eejeeeefffd�ZGd�d��Zejd��ZGd�d��Zdd ed efd�ZdS)�N)� dataclass)�Dict�List�Optional�Tuple)�get_assets_pathc�z�eZdZUdZdZeed<dZeed<dZe ed<ed��Z eed <d Ze ed<dZe ed <dS)� VadOptionsarVAD options. Attributes: threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH. It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets. neg_threshold: Silence threshold for determining the end of speech. If a probability is lower than neg_threshold, it is always considered silence. Values higher than neg_threshold are only considered speech if the previous sample was classified as speech; otherwise, they are treated as silence. This parameter helps refine the detection of speech transitions, ensuring smoother segment boundaries. min_speech_duration_ms: Final speech chunks shorter min_speech_duration_ms are thrown out. max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer than max_speech_duration_s will be split at the timestamp of the last silence that lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be split aggressively just before max_speech_duration_s. min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms before separating it speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side g�?� thresholdN� neg_thresholdr�min_speech_duration_ms�inf�max_speech_duration_si��min_silence_duration_msi�� speech_pad_ms) �__name__� __module__�__qualname__�__doc__r�float�__annotations__rr �intrrr��b/home/asafur/pinokio/api/open-webui.git/app/env/lib/python3.11/site-packages/faster_whisper/vad.pyr r s��*�I�u��M�5��"#��C�#�#�#�#(�5��<�<��5�/�/�/�#'��S�'�'�'��M�3��rr �>�audio�vad_options� sampling_rate�returnc �\�|�tdi|��}|j}|j}|j}|j}|j}d} |j} ||zdz}|| zdz}||z| z d|zz } ||zdz}|dzdz}t|��}t��}tj |d| |jd| zz f��}||�dd�� d��}d }g}i}|�t|d z d��}d}dx}}t|��D�]\}}||kr|r d}||kr| |z}||kr |sd}| |z|d <�.|rm| |z|d z | kr[|r1||d<|�|��i}||krd }n||d <dx}x}}n(| |z|d<|�|��i}dx}x}}d }��||kr_|r]|s| |z}| |z|z |kr|}| |z|z |kr��||d<|d|d z |kr|�|��i}dx}x}}d }��|r)||d z |kr||d<|�|��t|��D�]\\}}|dkr)t#td|d |z ��|d <|t|��dz kr�||dzd |dz }|d|zkr_|dxxt#|dz��z cc<t#td||dzd |dzz ��||dzd <��t#t%||d|z��|d<t#td||dzd |z ��||dzd <��3t#t%||d|z��|d<��^|S)a�This method is used for splitting long audios into speech chunks using silero VAD. Args: audio: One dimensional float array. vad_options: Options for VAD processing. sampling rate: Sampling rate of the audio. kwargs: VAD options passed as keyword arguments for backward compatibility. Returns: List of dicts containing begin and end samples of each speech chunk. N��br��Fg333333�?g{�G�z�?T�start�endr)r rrr rrr�len� get_vad_model�np�pad�shape�reshape�squeeze�max� enumerate�appendr�min)rrr�kwargsrrr rr�window_size_samplesr�min_speech_samples�speech_pad_samples�max_speech_samples�min_silence_samples�!min_silence_samples_at_max_speech�audio_length_samples�model�padded_audio�speech_probs� triggered�speeches�current_speech�temp_end�prev_end� next_start�i�speech_prob�speech�silence_durations r�get_speech_timestampsrJ-s��"�� *�*�6�*�*��%�I��-�M�(�?��'�=��)�A��-�M�&�)?�?�$�F��&��6��=��-�-� � � � � � !�� (�*A�A�D�H��(5��(:�T�(A�%��u�:�:��O�O�E��6� ��&��Q��:M�)M�M�N��L��5��-�-�a��4�4�5�5�=�=�a�@�@�L��I��H��N��I��,�d�3�3� ��H��H�z�#�L�1�1�2�2��;��9�$�$�(�$��H��H�$�$�0�1�4� ��9�$�$�i�$��I�&9�A�&=�N�7�#�� $�q�(�N�7�,C�C�FX�X�X�� (0��u�%��/�/�/�!#��(�(� %�I�I�.8�N�7�+�34�4��4�:��(;�a�(?��u�%��/�/�/�!#��34�4��4�:��!� ��-�'�'�Y�'�� 3�.��2��#�a�'�8�3�6W�W�W�#��#�a�'�8�3�6I�I�I��(0��u�%�"�5�)�N�7�,C�C�&�'�'��O�O�N�3�3�3�!#��34�4��4�:��!� �� (� !�N�7�$;� ;�?Q�Q�Q� 4��u��'�'�'��x�(�(�� 6��6�6�!�#�a��;M�)M�"N�"N�O�O�F�7�O��H� � ��!�!�!�'��A��w�7�&��-�G��!�&8�"8�8�8��u� � � ��%5��%:�!;�!;�;� � � �+.��8�A��E�?�7�3�6F�!�6K�K�L�L�,�,��Q��(�(�!$��,�f�U�m�>P�.P�Q�Q�!�!��u� �,/��8�A��E�?�7�3�6H�H�I�I�,�,��Q��(�(� ��(�&��-�:L�*L�M�M��F�5�M�M��Or�chunksc��|s)ddd�}tjgtj��g|gfSg}g}|D]W}|d|z|d|zd�}|�||d|d��|�|��X||fS)zCollects audio chunks.r)� start_time�end_time��dtyper(r))r,�array�float32r3)rrKr�chunk_metadata�audio_chunks�chunks_metadata�chunks r�collect_chunksrW�s��B�� 2�:�.�.�.�/�.�1A�A�A��L��O��/�/��.�=�8��e��}�4� � �� E�%��.�5��<�"?�@�A�A�A��~�.�.�.�.��(�(rc�h�eZdZdZd deededefd�Z dded e ed efd�Z ded efd�ZdS)�SpeechTimestampsMapz3Helper class to restore original speech timestamps.r$rKr�time_precisionc��||_||_g|_g|_d}d}|D]X}||d|z z }|d}|j�|d|z ��|j�||z��YdS)Nrr(r))rrZ�chunk_end_sample�total_silence_beforer3)�selfrKrrZ�previous_end�silent_samplesrVs r�__init__zSpeechTimestampsMap.__init__�s��*��,�� "��$&��!�� M� M�E��e�G�n�|�;�;�N� ��<�L��!�(�(��u��)F�G�G�G��%�,�,�^�m�-K�L�L�L�L� M� MrN�time�chunk_indexr c�z�|�|�|��}|j|}t||z|j��S�N)�get_chunk_indexr]�roundrZ)r^rbrcr]s r�get_original_timez%SpeechTimestampsMap.get_original_time�sD�� .�.�t�4�4�K�#�8��E��)�D�0�$�2E�F�F�Frc��t||jz��}ttj|j|��t|j��dz ��S)Nr&)rrr4�bisectr\r*)r^rb�samples rrfz#SpeechTimestampsMap.get_chunk_index�sN��T�D�.�.�/�/��M�$�/��8�8��%�&�&��*� � � r)r$re)rrrrr�dictrrarrrhrfrrrrYrY�s��=�=�M�M�t�D�z�M�#�M�s�M�M�M�M�&&*� G� G�� G��c�]� G� � G� G� G� G� �E� �c� � � � � � rrYc��tj�t��d��}tj�t��d��}t ||��S)zReturns the VAD model instance.zsilero_encoder_v5.onnxzsilero_decoder_v5.onnx)�os�path�joinr�SileroVADModel)�encoder_path�decoder_paths rr+r+�sL��7�<�<�� 1� 1�3K�L�L�L��7�<�<�� 1� 1�3K�L�L�L��,��5�5�5rc�6�eZdZd�Z d dejdedefd�ZdS) rqc�.� ddl}n"#t$r}td��|�d}~wwxYw|��}d|_d|_d|_d|_|�|dg|��|_ |�|dg|��|_ dS)Nrz8Applying the VAD filter requires the onnxruntime packager&F��CPUExecutionProvider)� providers�sess_options)�onnxruntime�ImportError�RuntimeError�SessionOptions�inter_op_num_threads�intra_op_num_threads�enable_cpu_mem_arena�log_severity_level�InferenceSession�encoder_session�decoder_session)r^rrrsrz�e�optss rrazSileroVADModel.__init__s�� J�� )�)�+�+��$%��!�$%��!�$)��!�"#��*�;�;��-�.�� <� � �� +�;�;��-�.�� <� � ��s�� &�!�&r"�@r�num_samples�context_size_samplesc �*�|jdks Jd��|jd|zdks Jd��|jd}tjd|dfd��}tj||fd��}|�|d |��}|d |d�f}d|dd�d f<tj|dd��}tj||gd��}|�d ||z��}d}|jd} g} td| |��D]E}|j� dd||||z�i��d}| � |��Ftj| d� ��}|�|d d��}g} tj||jdd� ��D]K}|j� d|� d��|d��\}}| � |��Ltj| d� �� d ��}|S)Nr$z>Input should be a 2D array with size (batch_size, num_samples)r&rz.Input size should be a multiple of num_samples�rRrOr'.i'�input)�axis)r��state)�ndimr.r,�zerosr/�roll�concatenate�ranger��runr3�splitr�r0�stack)r^rr�r�� batch_sizer��context� batched_audio�encoder_batch_size�num_segments�encoder_outputsrF�encoder_output�decoder_outputs�window�outs r�__call__zSileroVADModel.__call__si�� J�!�O�O�O�K� �O�O� �K��N�[�(�A�-�-�-�;� .�-�-��[��^� ��!�Z��-�Y�?�?�?��(� �-�.�� j�"�k�B�B� ��&:�%:�%;�%;� ;�<��2��'�'�1�a�(�(��'?��C�C� �%�-�-�b�+�@T�2T�U�U� �"��$�*�1�-��q�,�(:�;�;� 3� 3�A�!�1�5�5��w� �a�!�6H�2H�.H� I�J��N� �"�"�>�2�2�2�2��a�@�@�@��'�/�/� �B��D�D��h�~�~�/C�A�/F�Q�O�O�O� (� (�F��-�1�1��q� 1� 1�E�B�B��J�C�� "�"�3�'�'�'�'��h��Q�/�/�/�7�7��;�;�� rN)r"r�)rrrrar,�ndarrayrr�rrrrqrq�s]�� 4VX�.�.��Z�.�.1�.�OR�.�.�.�.�.�.rrqc�p�|sgSd}g}g}|j|zdz}|j|z}|dd}t|��D]�\} } | dkr+| d|| dz dkr| dxx|z cc<| t|��dz kr+| d|| dzdkr| dxx|zcc<| d|z |kr,||z dkr#|�|||d��| d}g}| d}|�| d| df��|�|||d��|S)Nrr#r(r&r))r(r)�segments)rrr2r*r3)� segments_listrr�curr_end�seg_idxs�merged_segments�edge_padding�chunk_length� curr_start�idx�segs r�merge_segmentsr�Js�� H��H��O��,�}�<��D�L��4�}�D�L��q�!�'�*�J��m�,�,�4�4��S��7�7��7�|�m�C�!�G�4�U�;�;�;��G��,��]�#�#�a�'�'�'��5�z�M�#��'�2�7�;�;�;��E� � � �l�*� � � ��u�:� �"�\�1�1�h��6K�a�6O�6O��"�"�'�#� (�� W��J��H��u�:��W��s�5�z�2�3�3�3�3�� r)Nr)r)rj� functoolsrn�dataclassesr�typingrrrr�numpyr,�faster_whisper.utilsrr r�rrlrJ�strrWrY� lru_cacher+rqr�rrr�<module>r�s�� !�!�!�!�!�!�.�.�.�.�.�.�.�.�.�.�.�.��0�0�0�0�0�0��@)-��J�J� �:�J��*�%�J��J� �$�Z�J�J�J�J�\AF�)�)� �:�)�#�D�z�)�:=�)� �4�� T�$�s�C�x�.�1�1�2�)�)�)�)�.# �# �# �# �# �# �# �# �L��6�6��6�H�H�H�H�H�H�H�H�V*�*�z�*�#�*�*�*�*�*�*r