� ���g�{� ��ddlZddlZddlmZddlmZddlmZmZddl m Z m Z m Z ddl Z ddlmZddlmZddl mZdd lmZdd lmZd d lmZd d lmZd dlmZd dlmZd dlm Z d dlm!Z"d dl#m$Z$m%Z%m&Z&m'Z'm(Z(d dl)m*Z*m+Z+e e,e-e-fe,e-e,dfZ.e-ej/��Z0e j1e2��Z3Gd�de-��Z4Gd�de5��Z6dZ7ej/ddgej8gd�ej9gd�iZ:dZ;ej<ej=d��kr d d!gZ>gd"�Z?n*ej<ej=d#��kr d$d!gZ>gd%�Z?nd&d'gZ>gd(�Z?ej/ej8ej9gZ@d)�e@D��ZAd*�e@D��ZBej/d+giZCe7gZDeBeAeCgZEd,ZFgd-�ZGd.e-d/eHfd0�ZId1e eJeKe-fd/eJe-e eKe-d2fffd3�ZLd4e-d.e-d/eHfd5�ZMd4e-d.e-d/eHfd6�ZNd7e e-geKe-fd/eJe-eKe-ffd8�ZO dJd.e-d9e-d:e eKe-d;e ed/eKe-f d<�ZPdKd9e-d;e ed/eJe-eKe-ffd=�ZQ dKd>e-d;e ed/e.fd?�ZR dJd@eKe-d;e edAe eSd/eKe.fdB�ZTGdC�d2eKe-��ZUGdD�dEeJe-eUf��ZVGdF�dGeKe-��ZWGdH�dIeJe-eWf��ZXdS)L�N)�partial)� has_magic)�Path�PurePath)�Callable�Optional�Union)� url_to_fs)�HTTPFileSystem)� HfFileSystem)�version)� thread_map�)�config)�DownloadConfig)� _split_re)�Split)�logging)�tqdm)�!_prepare_path_and_storage_options� is_local_path�is_relative_path� xbasename�xjoin)�glob_pattern_to_regex�string_to_dict�c��eZdZdS)�UrlN��__name__� __module__� __qualname__r��c/home/asafur/pinokio/api/open-webui.git/app/env/lib/python3.11/site-packages/datasets/data_files.pyrr"��������Dr$rc��eZdZdS)�EmptyDatasetErrorNr rr$r%r(r(&r&r$r(zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*�train�training)� validation�valid�dev�val)�test�testing�eval� evaluationz-._ 0-9z2023.9.0z**[{sep}/]{keyword}[{sep}]*z{keyword}[{sep}]*)z {keyword}/**z{keyword}[{sep}]*/**z**[{sep}/]{keyword}/**z**[{sep}/]{keyword}[{sep}]*/**z 2023.12.0z**/*[{sep}/]{keyword}[{sep}]*)z{keyword}/**/*z{keyword}[{sep}]*/**/*z**/*[{sep}/]{keyword}/**/*z"**/*[{sep}/]{keyword}[{sep}]*/**/*z**/{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*)z**/{keyword}/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**z**/*[{sep}]{keyword}[{sep}]*/**c�>�i|]}|d�t|D����S)c�Z�g|](}tD]}|�|t������)S�)�keyword�sep)�"KEYWORDS_IN_FILENAME_BASE_PATTERNS�format�NON_WORDS_CHARS��.0r6�patterns r%� <listcomp>z<dictcomp>.<listcomp>M�O�� � � � �9� � � � ���w�O��<�<� � � � r$��SPLIT_KEYWORDS�r<�splits r%� <dictcomp>rDL�K��&�&�&� �  � � �%�e�,� � � �&�&�&r$c�>�i|]}|d�t|D����S)c�Z�g|](}tD]}|�|t������)Sr5)�"KEYWORDS_IN_DIR_NAME_BASE_PATTERNSr9r:r;s r%r>z<dictcomp>.<listcomp>Ur?r$r@rBs r%rDrDTrEr$z**z*[])z README.mdz config.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr=�returnc�D��t�fd�tD����S)Nc3� �K�|]}|�vV�� dS�Nr)r<�wilcard_characterr=s �r%� <genexpr>z%contains_wildcards.<locals>.<genexpr>ts*�����Y�Y�0A� �G�+�Y�Y�Y�Y�Y�Yr$)�any�WILDCARD_CHARACTERS)r=s`r%�contains_wildcardsrQss'��� �Y�Y�Y�Y�EX�Y�Y�Y� Y� Y�Yr$�patterns� DataFilesListc���t|t��rd�|���D��St|t��r t|giSt|t ��r�t d�|D����r�|D]o}t|t��rFt|��dkr3d|vr/t|�d��tt f��std|������pd�|D��}tt|����t|��krtd|�����d �|D��St|iStt |����S) a/ Take the data_files patterns from the user, and format them into a dictionary. Each key is the name of the split, and each value is a list of data files patterns (paths or urls). The default split is "train". Returns: patterns: dictionary of split_name -> list of patterns c�d�i|]-\}}t|��t|t��r|n|g��.Sr��str� isinstance�list)r<�key�values r%rDz%sanitize_patterns.<locals>.<dictcomp>�s:��k�k�k�J�C�QV��C���:�e�T�#:�#:�G�%�%���k�k�kr$c3�@K�|]}t|t��V��dSrL)rX�dict�r<r=s r%rNz$sanitize_patterns.<locals>.<genexpr>�s,����A�A�W�z�'�4�(�(�A�A�A�A�A�Ar$�rC�pathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got c��g|] }|d�� S�rCrr^s r%r>z%sanitize_patterns.<locals>.<listcomp>�s��?�?�?�7�g�g�&�?�?�?r$z*Some splits are duplicated in data_files: c��i|]B}t|d��t|dt��r|dn|dg��CS)rCr`rVr^s r%rDz%sanitize_patterns.<locals>.<dictcomp>�s]�������G�G�$�%�%�*�W�V�_�VZ�:[�:[�'r�w�v���bi�jp�bq�ar���r$) rXr]�itemsrW�SANITIZED_DEFAULT_SPLITrYrO�len�get� ValueError�set�sanitize_patterns)rRr=�splitss r%rjrjws����(�D�!�!�1�k�k�Zb�Zh�Zh�Zj�Zj�k�k�k�k� �H�c� "� "�1�'�(��4�4� �H�d� #� #�1� �A�A��A�A�A� A� A� 7�#� � ���w��-�-���G� � ��)�)��7�*�*�"�7�;�;�v�#6�#6��d� �D�D�+�%�B�x�B�B����+� @�?�h�?�?�?�F��3�v�;�;���3�v�;�;�.�.� �!V�f�!V�!V�W�W�W���'���� � ,�X�6� 6� ��h���0�0�0r$�matched_rel_pathc���d�t|��jjD��}d�t|��jjD��}t|��t|��kS)u� When a path matches a pattern, we additionnally check if it's inside a special directory we ignore by default (if it starts with a double underscore). Users can still explicitly request a filepath inside such a directory if "__pycache__" is mentioned explicitly in the requested pattern. Some examples: base directory: ./ └── __pycache__ └── b.txt >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**") True >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt") True >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*") False >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*") False c�<�g|]}|�d���|��S��__�� startswith�r<�parts r%r>z6_is_inside_unrequested_special_dir.<locals>.<listcomp>�s-��"u�"u�"u�D�_c�_n�_n�os�_t�_t�"u�4�"u�"u�"ur$c�<�g|]}|�d���|��Srorqrss r%r>z6_is_inside_unrequested_special_dir.<locals>.<listcomp>�s-��%o�%o�%o�t�Y]�Yh�Yh�im�Yn�Yn�%o�d�%o�%o�%or$)r�parent�partsrf)rlr=�data_dirs_to_ignore_in_path�data_dirs_to_ignore_in_patterns r%�"_is_inside_unrequested_special_dirrz�sg��8#v�"u�H�=M�4N�4N�4U�4[�"u�"u�"u��%o�%o�x��7H�7H�7O�7U�%o�%o�%o�"� �*� +� +�s�3Q�/R�/R� R�Rr$c��d�t|��jD��}d�t|��jD��}t|��t|��kS)u: When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot. Users can still explicitly request a filepath that is hidden or is inside a hidden directory if the hidden part is mentioned explicitly in the requested pattern. Some examples: base directory: ./ └── .hidden_file.txt >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**") True >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*") False base directory: ./ └── .hidden_dir └── a.txt >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**") True >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*") False >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*") False base directory: ./ └── .hidden_dir └── .hidden_file.txt >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**") True >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*") True >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*") False >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*") True >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*") False c�d�g|]-}|�d��rt|��dhk�+|��.S��.�rrrirss r%r>zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>�sJ��"�"�"��T�_�_�S�=Q�=Q�"�Z]�^b�Zc�Zc�hk�gl�Zl�Zl��Zl�Zl�Zlr$c�d�g|]-}|�d��rt|��dhk�+|��.Sr}rrss r%r>zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>�sJ��%�%�%��D�O�O�C�4H�4H�%�QT�UY�QZ�QZ�_b�^c�Qc�Qc��Qc�Qc�Qcr$)rrwrf)rlr=�hidden_directories_in_path�hidden_directories_in_patterns r%�?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dirr��sw��j"�"�!�"2�3�3�9�"�"�"��%�%�!�'�*�*�0�%�%�%�!� �)� *� *�c�2O�.P�.P� P�Pr$�pattern_resolverc �� � � �tD�]+� � �dd��} ||��}n#t$rY�2wxYwt|��dkr�t ��� |D]X}t t |��tt � ������}|�J�� �|d���Ytd�� D����rtdt�d� �d ����� fd �tD��t� d �tD��z ��z}� fd �|D��cS��-tD]|� g}� ���D]Q\}}|D]I} ||��}n#t$rY�wxYwt|��dkr|�|��n�J�R|r� fd �|D��cS�}td|�d|�����)a+ Get the default pattern from a directory or repository by testing all the supported patterns. The first patterns to return a non-empty list of data files is returned. In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS. z{split}�*rNrCc3�LK�|]}tjt|�� V�� dSrL)�re�matchrrBs r%rNz+_get_data_files_patterns.<locals>.<genexpr>s1����F�F�e�r�x� �5�1�1�1�F�F�F�F�F�Fr$zSplit name should match 'z '' but got 'z'.c�6��g|]}|�v�t|����Sr�rW)r<rCrks �r%r>z,_get_data_files_patterns.<locals>.<listcomp>s%���W�W�W�E�u�PV���S��Z�Z���r$c�,�h|]}t|����Srr�rBs r%� <setcomp>z+_get_data_files_patterns.<locals>.<setcomp>s��A�A�A��#�e�*�*�A�A�Ar$c�@��i|]}|��|���g��S)rb)r9)r<rC� split_patterns �r%rDz,_get_data_files_patterns.<locals>.<dictcomp>s/���Z�Z�Z�5�E�M�0�0�u�0�=�=�>�Z�Z�Zr$c�"��i|] }|�|�� Srr)r<rC� patterns_dicts �r%rDz,_get_data_files_patterns.<locals>.<dictcomp>$s ���N�N�N�E�E�=��/�N�N�Nr$zCouldn't resolve pattern z with resolver )�ALL_SPLIT_PATTERNS�replace�FileNotFoundErrorrfrirrr�addrOrhr�DEFAULT_SPLITS�sorted�ALL_DEFAULT_PATTERNSrd�append) r�r=� data_files�p�p_parts� sorted_splits�non_empty_splitsrCrRr�r�rks @@@r%�_get_data_files_patternsr��s������,�[�[� ��'�'� �3�7�7�� �)�)�'�2�2�J�J�� � � � � �H� ���� �z�?�?�Q� � �"�u�u�F�� -� -��(��1���7L�Y�Wd�Me�Me�7f�7f�g�g���*�*�*�� � �7�7�+�,�,�,�,��F�F�v�F�F�F�F�F� `� �!^�Y�!^�!^�TZ�!^�!^�!^�_�_�_�W�W�W�W�^�W�W�W�Z`��A�A�.�A�A�A�A�[�[��M�[�Z�Z�Z�M�Z�Z�Z� Z� Z� Z� �.� O� O� ���,�2�2�4�4� � �O�E�8�#� � ���!1�!1�'�!:�!:�J�J��(�����H������z�?�?�Q�&�&�$�+�+�E�2�2�2��E�'�� � O�N�N�N�N�=M�N�N�N� N� N� N� O� �b��b�b�P`�b�b� c� c�cs� 1� >�>�! E-�- E: �9E: � base_path�allowed_extensions�download_configc���� � ��t|��rt||��}nDt|��r3tj�|��dtjz}nd}t||���\}}t|fi|��\}� tt��t|��hz � t|j t��r|j n |j d}|dkr|dznd�i}|dkr'tjt#jd��krd|d <� � �fd �|j|fd d i|�����D��}��{�fd�|D��} t+| ��t+|��krLt-t|��t| ��z ��} t.�d|�d| ����n|} | s,d|�d�} ��| dt-�����z } t3| ���| S)a� Resolve the paths and URLs of the data files from the pattern passed by the user. You can use patterns to resolve multiple local files. Here are a few examples: - *.csv to match all the CSV files at the first level - **.csv to match all the CSV files at any level - data/* to match all the files inside "data" - data/** to match all the files inside "data" and its subdirectories The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix other than a forward slash /. More generally: - '*' matches any character except a forward-slash (to match just the file or directory name) - '**' matches any character including a forward-slash / Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested. The same applies to special directories that start with a double underscore like "__pycache__". You can still include one if the pattern explicilty mentions it: - to include a hidden file: "*/.hidden.txt" or "*/.*" - to include a hidden directory: ".hidden/*" or ".*/*" - to include a special directory: "__special__/*" or "__*/*" Example:: >>> from datasets.data_files import resolve_pattern >>> base_path = "." >>> resolve_pattern("docs/**/*.py", base_path) [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py'] Args: pattern (str): Unix pattern or paths or URLs of the data files to resolve. The paths can be absolute or relative to base_path. Remote filesystems using fsspec are supported, e.g. with the hf:// protocol. base_path (str): Base path to use when resolving relative paths. allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions). For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"] download_config ([`DownloadConfig`], *optional*): Specific download configuration parameters. Returns: List[str]: List of paths or URLs to the local or remote files that match the patterns. r��r��filez://�hfz0.20.0F� expand_infoc�j��g|]�\}}|ddksQ|�d��r�tj�tj�|����rMt |���v�st |�����t|�����|����r|n�|z���S)�typer��islink) rg�osr`�isfile�realpathrrzr�rr)r<�filepath�info�files_to_ignore� fs_pattern�protocol_prefixs ���r%r>z#resolve_pattern.<locals>.<listcomp>gs������� �H�d� ��L�F� "� "�t�x�x��'9�'9� "�b�g�n�n�R�W�M]�M]�^f�Mg�Mg�>h�>h� "� �x� � �� 7� 7�2�8�Z�H�H�8�O�PX�Zd�e�e�8��'�'��8�8�X���o�PX�>X�8� 7� 7r$�detailTNc���g|]G}t�fd�t|���d��dd�D�����E|��HS)c3�&�K�|] }d|z�vV�� dS)r~Nr)r<�suffixr�s �r%rNz-resolve_pattern.<locals>.<listcomp>.<genexpr>ss-�����g�g�&�3��<�#5�5�g�g�g�g�g�gr$r~rN)rOrrC)r<r�r�s �r%r>z#resolve_pattern.<locals>.<listcomp>pso��� � � ���g�g�g�g�I�h�DW�DW�D]�D]�^a�Db�Db�cd�ce�ce�Df�g�g�g�g�g� � � � � r$z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find '�'z with any supported extension )rrrr�r`� splitdriver7rr ri�FILES_TO_IGNORErrX�protocolrWr�HF_HUB_VERSIONr �parse�globrdrfrY�loggerr�r�)r=r�r�r��storage_options�fsr�� glob_kwargs� matched_paths�out�invalid_matched_files� error_msgr�r�r�s ` @@@r%�resolve_patternr�(sm������`�� � ��� �7�+�+��� �w� � ���G�&�&�w�/�/��2�R�V�;� � �� �@��Zi�j�j�j��G�_��w�:�:�/�:�:�N�B� ��/�*�*�i��.@�.@�-A�A�O�(���c�:�:�N�r�{�{�� �A��H�*2�f�*<�*<�h��&�&�"�O��K��4���F�1�W�]�8�5L�5L�L�L�%*� �M�"�������%�b�g�g�J�J�d�J�k�J�J�P�P�R�R����M��%� � � � �)� � � �� �s�8�8�c�-�(�(� (� (�$(��]�);�);�c�#�h�h�)F�$G�$G� !� �K�K�A�7�A�A�i~�A�A� � � ���� �+�1�w�1�1�1� � � )� �T�$�?Q�:R�:R�T�T� T�I�� �*�*�*� �Jr$c��tt||���} t|��S#t$rt d|�d���d�wxYw)uA Get the default pattern from a directory testing all the supported patterns. The first patterns to return a non-empty list of data files is returned. Some examples of supported patterns: Input: my_dataset_repository/ ├── README.md └── dataset.csv Output: {'train': ['**']} Input: my_dataset_repository/ ├── README.md ├── train.csv └── test.csv my_dataset_repository/ ├── README.md └── data/ ├── train.csv └── test.csv my_dataset_repository/ ├── README.md ├── train_0.csv ├── train_1.csv ├── train_2.csv ├── train_3.csv ├── test_0.csv └── test_1.csv Output: {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'], 'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]} Input: my_dataset_repository/ ├── README.md └── data/ ├── train/ │ ├── shard_0.csv │ ├── shard_1.csv │ ├── shard_2.csv │ └── shard_3.csv └── test/ ├── shard_0.csv └── shard_1.csv Output: {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...], 'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]} Input: my_dataset_repository/ ├── README.md └── data/ ├── train-00000-of-00003.csv ├── train-00001-of-00003.csv ├── train-00002-of-00003.csv ├── test-00000-of-00001.csv ├── random-00000-of-00003.csv ├── random-00001-of-00003.csv └── random-00002-of-00003.csv Output: {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']} In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS. )r�r�zThe directory at z doesn't contain any data filesN)rr�r�r�r()r�r��resolvers r%�get_data_patternsr��si��h��)�_�]�]�]�H�j�'��1�1�1�� �j�j�j�� ^�I� ^� ^� ^�_�_�ei�i�j���s �(�A� data_filec��t||���\}}t|fi|��^}}t|t��r#|�|��}|j|jfSt|t��r�|�tj ��rttj |j ���}d|ttj ��dzd�� ddd��z}|�|��}|j|jfS|�|��}dD]}||vrt||��fcS�dS) Nr�)�endpoint�tokenzhf://rz /resolve/�@)�ETag�etag�mtimer)rr rXr � resolve_path�repo_id�revisionr rrr� HF_ENDPOINTr�rfr�r�rW) r�r�r�r��_� resolved_path�hffsr�rZs r%�_get_single_origin_metadatar��sU��"C�9�^m�!n�!n�!n��I�� �y� 4� 4�O� 4� 4�F�B���"�l�#�#�=���� �2�2� ��$�m�&<�<�<� �B�� '� '�=�I�,@�,@��AS�,T�,T�=��V�%7��?T�U�U�U���i��F�,>�(?�(?�!�(C�(E�(E�F�N�N�{�\_�ab�c�c�c� ��)�)�)�4�4� ��$�m�&<�<�<� �7�7�9� � �D�(�%�%�� �$�;�;���S� �N�N�$� $� $� $� � �2r$r�� max_workersc ��|�|n tj}ttt|���||t dt |��dkpd���S)Nr�zResolving data files�)r�� tqdm_class�desc�disable)r�&HF_DATASETS_MULTITHREADING_MAX_WORKERSrrr��hf_tqdmrf)r�r�r�s r%�_get_origin_metadatar��s\�� "-�!8�+�+�f�>k�K� ��+�_�M�M�M���� #��J���2�%�-�� � � �r$c���eZdZdZdeedeeddf�fd� Zdd�Ze dd eed e j j d e ed e eed e eddf d���Ze dd eed e ed e eed e eddf d���Ze dd eed e ed e eed e eddf d���Zddd�de eede eeddfd�Z�xZS)rSa� List of data files (absolute local paths or URLs). It has two construction methods given the user's data files patterns: - ``from_hf_repo``: resolve patterns inside a dataset repository - ``from_local_or_remote``: resolve patterns from a local path Moreover, DataFilesList has an additional attribute ``origin_metadata``. It can store: - the last modified time of local files - ETag of remote files - commit sha of a dataset repository Thanks to this additional attribute, it is possible to hash the list and get a different hash if and only if at least one file changed. This is useful for caching Dataset objects that are obtained from a list of data files. r��origin_metadatarINc�X��t���|��||_dSrL)�super�__init__r�)�selfr�r�� __class__s �r%r�zDataFilesList.__init__s)��� ������$�$�$�.����r$�otherc�D�tg|�|�|j|jz��SrL)rSr��r�r�s r%�__add__zDataFilesList.__add__s%���_�t�_�e�_�d�.B�U�EZ�.Z�[�[�[r$rR� dataset_infor�r�r�c��d|j�d|j�d|pd���d��}|�||||���S)Nzhf://datasets/r��/r��r�r�r�)�id�sha�rstrip� from_patterns)�clsrRr�r�r�r�s r%� from_hf_repozDataFilesList.from_hf_reposc��\�\�_�[�[�|�7G�[�[�)�/�WY�[�[�b�b�cf�g�g� �� � � � �>P�bq�!� � � r$c��|�|n1t��������}|�||||���S�Nr�)r�resolve�as_posixr�)r�rRr�r�r�s r%�from_local_or_remotez"DataFilesList.from_local_or_remote-sT��"+�!6�I�I�D�F�F�N�N�<L�<L�<U�<U�<W�<W� �� � � � �>P�bq�!� � � r$c �D�|�|n1t��������}g}|D]I} |�t ||||������*#t $rt |��s�Y�FwxYwt||���}|||��S�Nr�r�)rr�r��extendr�r�rr�)r�rRr�r�r�r�r=r�s r%r�zDataFilesList.from_patterns:s���"+�!6�I�I�D�F�F�N�N�<L�<L�<U�<U�<W�<W� �� �� � �G� ��!�!�#��"+�+=�(7� ��������%� � � � ��)�)����� ����/�z�?�[�[�[���s�:��/�/�/s�&A%�%B�B�� extensions� file_namesrrc����g�|rJd�d�|D����}��tjd|�d�����|rJd�d�|D����}��tjd|�d������r"t �fd�|D��|j� ��St t |��|j� ��S) N�|c3�>K�|]}tj|��V��dSrL�r��escape)r<�exts r%rNz'DataFilesList.filter.<locals>.<genexpr>Ys*����"H�"H�c�2�9�S�>�>�"H�"H�"H�"H�"H�"Hr$z.*(z )(\..+)?$c3�>K�|]}tj|��V��dSrLr)r<�fns r%rNz'DataFilesList.filter.<locals>.<genexpr>\s*����!E�!E�B�"�)�B�-�-�!E�!E�!E�!E�!E�!Er$z.*[\/]?(z)$c�L���g|]�t�fd��D�������� S)c3�B�K�|]}|����V��dSrL)r�)r<r=r�s �r%rNz2DataFilesList.filter.<locals>.<listcomp>.<genexpr>`s0�����7i�7i�U\�� � �i�8P�8P�7i�7i�7i�7i�7i�7ir$)rO)r<r�rRs @�r%r>z(DataFilesList.filter.<locals>.<listcomp>`s=����j�j�j�y�C�7i�7i�7i�7i�`h�7i�7i�7i�4i�4i�j��j�j�jr$)r�)�joinr�r��compilerSr�rY)r�rr� ext_pattern� fn_patternrRs @r%�filterzDataFilesList.filterTs����� � G��(�(�"H�"H�Z�"H�"H�"H�H�H�K� �O�O�B�J�'D�[�'D�'D�'D�E�E� F� F� F� � D����!E�!E�*�!E�!E�!E�E�E�J� �O�O�B�J�'A�:�'A�'A�'A�B�B� C� C� C� � S� �j�j�j�j�D�j�j�j� $� 4���� � !��d���T�=Q�R�R�R� Rr$)r�rSrIrS�NNN)r!r"r#�__doc__rYrW�SingleOriginMetadatar�r�� classmethod�huggingface_hub�hf_api� DatasetInforrr�r�r�r� __classcell__�r�s@r%rSrSsO���������"/�4��9�/�t�DX�?Y�/�^b�/�/�/�/�/�/�\�\�\�\�� $(�26�48�  �  ��s�)�  �&�,�8�  ��C�=�  � %�T�#�Y�/�  � "�.�1�  � �  �  �  ��[�  ��$(�26�48�  �  ��s�)�  ��C�=�  �%�T�#�Y�/�  � "�.�1�  � �  �  �  ��[�  ��$(�26�48� 0�0��s�)�0��C�=�0�%�T�#�Y�/� 0� "�.�1� 0� � 0�0�0��[�0�448�[_�S�S�S�%�d�3�i�0�S�EM�d�SV�i�EX�S� �S�S�S�S�S�S�S�Sr$c�&�eZdZdZe ddeeeeee ffde ede eede e ddf d���Z e ddeeeeee ffd e jjde ede eede e ddf d ���Ze ddeeeeee ffde ede eede e ddf d ���Zddd �d e eede eeddfd�ZdS)� DataFilesDicta� Dict of split_name -> list of data files (absolute local paths or URLs). It has two construction methods given the user's data files patterns : - ``from_hf_repo``: resolve patterns inside a dataset repository - ``from_local_or_remote``: resolve patterns from a local path Moreover, each list is a DataFilesList. It is possible to hash the dictionary and get a different hash if and only if at least one file changed. For more info, see [`DataFilesList`]. This is useful for caching Dataset objects that are obtained from a list of data files. Changing the order of the keys of this dictionary also doesn't change its hash. NrRr�r�r�rIc��|��}|���D]=\}}t|t��r|nt�||||���||<�>|Sr�)rdrXrSr��r�rRr�r�r�r�rZ�patterns_for_keys r%r�z"DataFilesDict.from_local_or_remotews���c�e�e��%-�^�^�%5�%5� � � !�C�!��.� �>�>�� � �"�7�7�$�'�'9�$3� 8��� ��H�H�� r$r�c���|��}|���D]>\}}t|t��r|nt�|||||���||<�?|S)N)r�r�r�r�)rdrXrSr�) r�rRr�r�r�r�r�rZr s r%r�zDataFilesDict.from_hf_repo�s����c�e�e��%-�^�^�%5�%5� � � !�C�!��.� �>�>�� � �"�/�/�$�!-�'�'9�$3� 0��� ��H�H�� r$c��|��}|���D]=\}}t|t��r|nt�||||���||<�>|Sr�)rdrXrSr�rs r%r�zDataFilesDict.from_patterns�s���c�e�e��%-�^�^�%5�%5� � � !�C�!��.� �>�>�� � �"�0�0�$�'�'9�$3� 1��� ��H�H�� r$rrrc��t|����}|���D]\}}|�||���||<� |S)Nr)r�rdr)r�rrr�rZ�data_files_lists r%rzDataFilesDict.filter�sW���d�4�j�j�l�l��$(�J�J�L�L� \� \� �C��&�-�-��PZ�-�[�[�C��H�H�� r$r)r!r"r#rrr]rWr rYrSrrr�rrrr�r�rrr$r%rrgs������ � ��$(�26�48� ���s�E�$�s�)�]�":�;�;�<���C�=��%�T�#�Y�/� � "�.�1� � � ����[��*� $(�26�48� ���s�E�$�s�)�]�":�;�;�<��&�,�8���C�=� � %�T�#�Y�/� � "�.�1� � �����[��.�$(�26�48� ���s�E�$�s�)�]�":�;�;�<���C�=��%�T�#�Y�/� � "�.�1� � � ����[��,48�[_����%�d�3�i�0��EM�d�SV�i�EX�� ������r$rc����eZdZdZdeedeeeef�fd� Zd�Ze ddeedeeeddfd���Z dd ed ee dd fd �Z d eeddfd�Z �xZS)�DataFilesPatternsListz� List of data files patterns (absolute local paths or URLs). For each pattern there should also be a list of allowed extensions to keep, or a None ot keep all the files for the pattern. rRr�c�X��t���|��||_dSrL)r�r�r�)r�rRr�r�s �r%r�zDataFilesPatternsList.__init__�s+��� ������"�"�"�"4����r$c�D�tg|�|�|j|jz��SrL)rSr�r�s r%r�zDataFilesPatternsList.__add__�s%���_�t�_�e�_�d�.E��H`�.`�a�a�ar$NrIc�<�|||gt|��z��SrL)rf)r�rRr�s r%r�z#DataFilesPatternsList.from_patterns�s%���s�8�0�1�C��M�M�A�B�B�Br$r�r�rSc �x�|�|n1t��������}g}t||j��D]L\}} |�t ||||������-#t$rt|��s�Y�IwxYwt||���}t||��Sr�) rr�r��zipr�rr�r�rr�rS)r�r�r�r�r=r�r�s r%r�zDataFilesPatternsList.resolve�s��� "+�!6�I�I�D�F�F�N�N�<L�<L�<U�<U�<W�<W� �� �+.�t�T�5L�+M�+M� � � '�G�'� ��!�!�#��"+�+=�(7� ��������%� � � � ��)�)����� ����/�z�?�[�[�[���Z��9�9�9s�&A;�;B�Brc�F��t|�fd�|jD����S)Nc���g|]}|�z��Srr)r<r�rs �r%r>z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>�s ���e�e�e�7I�%� �2�e�e�er$)r&r�)r�rs `r%�filter_extensionsz'DataFilesPatternsList.filter_extensions�s2���$� �e�e�e�e�T�Md�e�e�e� � � r$rL)r!r"r#rrYrWrr�r�rr�rr�r.rrs@r%r&r&�s8��������� 5��s�)�5�!��$�s�)�!4�5�5�5�5�5�5�5�b�b�b��LP�C�C��C�y�C�6>�t�C�y�6I�C� �C�C�C��[�C�59�:�:��:�"�.�1�:� � :�:�:�:�. �D��I� �:Q� � � � � � � � r$r&c��eZdZdZe d deeeefdeeeddfd���Z d dedee dd fd �Z d eeddfd �Z dS)�DataFilesPatternsDictz[ Dict of split_name -> list of data files patterns (absolute local paths or URLs). NrRr�rIc��|��}|���D];\}}t|t��r|nt�||���||<�<|S)N)r�)rdrXr&r�)r�rRr�r�rZr s r%r�z#DataFilesPatternsDict.from_patterns�sz���c�e�e��%-�^�^�%5�%5� � � !�C�!��.�0E�F�F�� � �*�8�8�$�'9�9��� ��H�H�� r$r�r�rc��t��}|���D]\}}|�||��||<�|SrL)rrdr�)r�r�r�r�rZ�data_files_patterns_lists r%r�zDataFilesPatternsDict.resolvesL�� �o�o��-1�Z�Z�\�\� T� T� )�C�)�/�7�7� �?�S�S�C��H�H�� r$rc��t|����}|���D]\}}|�|��||<�|SrL)r�rdr.)r�rr�rZr3s r%r.z'DataFilesPatternsDict.filter_extensionssP���d�4�j�j�l�l��-1�Z�Z�\�\� N� N� )�C�)�/�A�A�*�M�M�C��H�H�� r$rL) r!r"r#rrr]rWrYrr�rr�r.rr$r%r0r0�s����������W[� � ��C��c��N�+� �AI�$�s�)�AT� � � � � ��[� �$59�����"�.�1�� � �����D��I��:Q������r$r0)NNrL)Yr�r�� functoolsrr�r�pathlibrr�typingrrr r� fsspec.corer �fsspec.implementations.httpr r � packagingr �tqdm.contrib.concurrentrr�r�downloadr�namingrrkr�utilsrrr��utils.file_utilsrrrrr�utils.py_utilsrr�tuplerWr�TRAINre� get_loggerr!r�rr�r(�SPLIT_PATTERN_SHARDED� VALIDATION�TESTrAr:�FSSPEC_VERSIONr�r8rHr��"DEFAULT_PATTERNS_SPLIT_IN_FILENAME�"DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME�DEFAULT_PATTERNS_ALLr�r�rPr��boolrQr]rYrjrzr�r�r�r�r��intr�rSrr&r0rr$r%�<module>rMs�� � � � � � � � �������������"�"�"�"�"�"�"�"�,�,�,�,�,�,�,�,�,�,�����!�!�!�!�!�!�6�6�6�6�6�6�(�(�(�(�(�(�������.�.�.�.�.�.�������$�$�$�$�$�$�������������������"�"�"�"�"�"�r�r�r�r�r�r�r�r�r�r�r�r�r�r�A�A�A�A�A�A�A�A��U�3��8�_�e�C�j�%��)�C�D���#�e�k�*�*�� �� �H� %� %�� � � � � �#� � � � � � � � �)� � � �a�� �K�'�:�&� ��;�;�;� �J�9�9�9��� �� ��=�7�=��4�4�4�4�*G�I\�)]�&�*�*�*�&�&�  ��]�W�]�;�7�7�7�7�*I�K^�)_�&�*�*�*�&�&�+A�B`�)a�&�*�*�*�&��+�u�/���<��&�&�  � &�&�&�"�&�&�  � &�&�&�"� �K�$����,�,��&�&���� ������Z��Z��Z�Z�Z�Z�#1��d�D�#�o� 6�#1�4��U�4�PS�9�Ve�Ke�Ef�@f�;g�#1�#1�#1�#1�LS��S�s�S�t�S�S�S�S�B;Q�VY�;Q�dg�;Q�lp�;Q�;Q�;Q�;Q�|)d�x���t�C�y�8H�/I�)d�d�SV�X\�]`�Xa�Sa�Nb�)d�)d�)d�)d�^/3�04� Y�Y� �Y��Y�!��c��+�Y��n�-� Y�  �#�Y� Y�Y�Y�Y�xXj�Xj��Xj�x��7O�Xj�[_�`c�ei�jm�en�`n�[o�Xj�Xj�Xj�Xj�z15������n�-�������015�!%����S� ���n�-���#��� � �� ����"^S�^S�^S�^S�^S�D��I�^S�^S�^S�BZ�Z�Z�Z�Z�D��m�+�,�Z�Z�Z�z2 �2 �2 �2 �2 �D��I�2 �2 �2 �j#�#�#�#�#�D��&;�!;�<�#�#�#�#�#r$
Memory