"""DatasetBuilder base class."""

import abc
import contextlib
import copy
import inspect
import os
import posixpath
import shutil
import textwrap
import time
import urllib
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from functools import partial
from pathlib import Path
from typing import TYPE_CHECKING, Optional, Union
from unittest.mock import patch

import fsspec
from fsspec.core import url_to_fs
from multiprocess import Pool
from tqdm.contrib.concurrent import thread_map

from . import config, utils
from .arrow_dataset import Dataset
from .arrow_reader import ArrowReader, ReadInstruction
from .arrow_writer import ArrowWriter, ParquetWriter, SchemaInferenceError
from .data_files import DataFilesDict, DataFilesPatternsDict, sanitize_patterns
from .dataset_dict import DatasetDict, IterableDatasetDict
from .download.download_config import DownloadConfig
from .download.download_manager import DownloadManager, DownloadMode
from .download.streaming_download_manager import StreamingDownloadManager, xjoin
from .exceptions import (
    DatasetGenerationCastError,
    DatasetGenerationError,
    FileFormatError,
    ManualDownloadError,
)
from .features import Features
from .filesystems import is_remote_filesystem, rename
from .fingerprint import Hasher
from .info import DatasetInfo, DatasetInfosDict, PostProcessedInfo
from .iterable_dataset import ArrowExamplesIterable, ExamplesIterable, IterableDataset
from .keyhash import DuplicatedKeysError
from .naming import INVALID_WINDOWS_CHARACTERS_IN_PATH, camelcase_to_snakecase
from .splits import Split, SplitDict, SplitGenerator, SplitInfo
from .streaming import extend_dataset_builder_for_streaming
from .table import CastError
from .utils import logging
from .utils import tqdm as hf_tqdm
from .utils._filelock import FileLock
from .utils.file_utils import is_remote_url
from .utils.info_utils import VerificationMode, get_size_checksum_dict, verify_checksums, verify_splits
from .utils.py_utils import (
    classproperty,
    convert_file_size_to_int,
    has_sufficient_disk_space,
    iflatmap_unordered,
    map_nested,
    memoize,
    size_str,
    temporary_assignment,
)
from .utils.sharding import _number_of_shards_in_gen_kwargs, _split_gen_kwargs
from .utils.track import tracked_list


if TYPE_CHECKING:
    from .load import DatasetModule


logger = logging.get_logger(__name__)


class InvalidConfigName(ValueError):
    pass


@dataclass
class BuilderConfig:
    """Base class for `DatasetBuilder` data configuration.

    `DatasetBuilder` subclasses with data configuration options should subclass
    `BuilderConfig` and add their own properties.

    Attributes:
        name (`str`, defaults to `default`):
            The name of the configuration.
        version (`Version` or `str`, defaults to `0.0.0`):
            The version of the configuration.
        data_dir (`str`, *optional*):
            Path to the directory containing the source data.
        data_files (`str` or `Sequence` or `Mapping`, *optional*):
            Path(s) to source data file(s).
        description (`str`, *optional*):
            A human description of the configuration.
    """

    name: str = "default"
    version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
    data_dir: Optional[str] = None
    data_files: Optional[Union[DataFilesDict, DataFilesPatternsDict]] = None
    description: Optional[str] = None

    def __post_init__(self):
        # The config name is used as a directory name, so it must be safe on every filesystem.
        for invalid_char in INVALID_WINDOWS_CHARACTERS_IN_PATH:
            if invalid_char in self.name:
                raise InvalidConfigName(
                    f"Bad characters from black list '{INVALID_WINDOWS_CHARACTERS_IN_PATH}' found in '{self.name}'. "
                    f"They could create issues when creating a directory for this config on Windows filesystem."
                )
        if self.data_files is not None and not isinstance(self.data_files, (DataFilesDict, DataFilesPatternsDict)):
            raise ValueError(f"Expected a DataFilesDict in data_files but got {self.data_files}")

    def __eq__(self, o):
        # Compare all attributes, including any extra ones defined by subclasses.
        if set(self.__dict__.keys()) != set(o.__dict__.keys()):
            return False
        return all((k, getattr(self, k)) == (k, getattr(o, k)) for k in self.__dict__.keys())

    def create_config_id(
        self,
        config_kwargs: dict,
        custom_features: Optional[Features] = None,
    ) -> str:
        """
        The config id is used to build the cache directory.
        By default it is equal to the config name.
        However the name of a config is not sufficient to have a unique identifier for the dataset being generated
        since it doesn't take into account:
        - the config kwargs that can be used to overwrite attributes
        - the custom features used to write the dataset
        - the data_files for json/text/csv/pandas datasets

        Therefore the config id is just the config name with an optional suffix based on these.
        """
        # Hashes the overriding config kwargs (and the custom features, if any) into
        # a "-"-separated suffix, falling back to a `Hasher` digest when the readable
        # form would exceed `config.MAX_DATASET_CONFIG_ID_READABLE_LENGTH`.
        ...

    def _resolve_data_files(self, base_path: str, download_config: DownloadConfig) -> None:
        # Resolve data file patterns (e.g. "data/*.csv") into concrete paths, relative to base_path.
        if isinstance(self.data_files, DataFilesPatternsDict):
            base_path = xjoin(base_path, self.data_dir) if self.data_dir else base_path
            self.data_files = self.data_files.resolve(base_path, download_config)
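# A minimal sketch (not part of the original module) of the subclassing pattern the
# `BuilderConfig` docstring describes: a dataset-specific config that adds its own
# fields. The class name and both extra fields below are hypothetical.
@dataclass
class _ExampleBuilderConfig(BuilderConfig):
    # Extra, dataset-specific options; defaults keep the dataclass constructible.
    url: Optional[str] = None
    citation: Optional[str] = None


# Hypothetical usage, mirroring how `DatasetBuilder` instantiates configs from
# **config_kwargs: _ExampleBuilderConfig(name="plain_text", version="1.0.0", url="https://example.com/data")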
eBfeBffd@�ZCdeBfdA�ZDdBe:dCeEe e fde e:fdD�ZFd9e dee e ffdE�ZGd9e dFe d,e0de e fdG�ZHe(j)d,ee0eIffdH���ZJe(j) dSdIeKd-e d.e ee efd/e efdJ���ZLdIeKdeMfdK�ZNdS)T�DatasetBuildera&Abstract base class for all datasets. `DatasetBuilder` has 3 key methods: - [`DatasetBuilder.info`]: Documents the dataset, including feature names, types, shapes, version, splits, citation, etc. - [`DatasetBuilder.download_and_prepare`]: Downloads the source data and writes it to disk. - [`DatasetBuilder.as_dataset`]: Generates a [`Dataset`]. Some `DatasetBuilder`s expose multiple variants of the dataset by defining a [`BuilderConfig`] subclass and accepting a config object (or name) on construction. Configurable datasets expose a pre-defined set of configurations in [`DatasetBuilder.builder_configs`]. Args: cache_dir (`str`, *optional*): Directory to cache data. Defaults to `"~/.cache/huggingface/datasets"`. dataset_name (`str`, *optional*): Name of the dataset, if different from the builder name. Useful for packaged builders like csv, imagefolder, audiofolder, etc. to reflect the difference between datasets that use the same packaged builder. config_name (`str`, *optional*): Name of the dataset configuration. It affects the data generated on disk. Different configurations will have their own subdirectories and versions. If not provided, the default configuration is used (if it exists). <Added version="2.3.0"> Parameter `name` was renamed to `config_name`. </Added> hash (`str`, *optional*): Hash specific to the dataset code. Used to update the caching directory when the dataset loading script code is updated (to avoid reusing old data). The typical caching directory (defined in `self._relative_data_dir`) is `name/version/hash/`. base_path (`str`, *optional*): Base path for relative paths that are used to download files. This can be a remote URL. features ([`Features`], *optional*): Features types to use with this dataset. It can be used to change the [`Features`] types of a dataset, for example. token (`str` or `bool`, *optional*): String or boolean to use as Bearer token for remote files on the Datasets Hub. If `True`, will get token from `"~/.huggingface"`. repo_id (`str`, *optional*): ID of the dataset repository. Used to distinguish builders with the same name but not coming from the same namespace, for example "rajpurkar/squad" and "lhoestq/squad" repo IDs. In the latter, the builder name would be "lhoestq___squad". data_files (`str` or `Sequence` or `Mapping`, *optional*): Path(s) to source data file(s). For builders like "csv" or "json" that need the user to specify data files. They can be either local or remote files. For convenience, you can use a `DataFilesDict`. data_dir (`str`, *optional*): Path to directory containing source data file(s). Use only if `data_files` is not passed, in which case it is equivalent to passing `os.path.join(data_dir, "**")` as `data_files`. For builders that require manual download, it must be the path to the local directory containing the manually downloaded data. storage_options (`dict`, *optional*): Key/value pairs to be passed on to the dataset file-system backend, if any. writer_batch_size (`int`, *optional*): Batch size used by the ArrowWriter. It defines the number of samples that are kept in memory before writing them and also the length of the arrow chunks. None means that the ArrowWriter will use its default value. 
**config_kwargs (additional keyword arguments): Keyword arguments to be passed to the corresponding builder configuration class, set on the class attribute [`DatasetBuilder.BUILDER_CONFIG_CLASS`]. The builder configuration class is [`BuilderConfig`] or a subclass of it. N� cache_dir� dataset_name� config_namer�r��info�features�token�repo_idr\r[�storage_options�writer_batch_sizec �� �t|j�d��d��|_||_||_||_| |_| pi|_|rt|��n|j|_ | p|j |_ | �Mt| t��s8tjt| ��|t!||j������} dt#j|jj��jvr|�||d<| �| |d<| �| |d<||_|jd||d�|��\|_|_|�;|���}|�|�����|j|_|j |_ |jj|_|jj|_||_ |� ||j _!tE|p t0j#��|_$tK|j$��r|j$n#tLj'�(|j$��|_$|r$tSj*|j$t0j+��ntEt0j,��|_-tK|j-��r|j-n#tLj'�(|j-��|_-d|_.|�/��|_0tK|j$���s�tMj1|j$d � ��tLj'�*|j$te|j0���3���4d d ��d z��}tk|��5tLj'�6|j0��r�totMj8|j0����dkr�tLj'�6tLj'�*|j0t0j9����r8tt� d��twj<|j0��|_ nDtt�=d|j0�d|j �d���tMj>|j0��ddd��n #1swxYwY|j0|_?t�jAd��|_Bd|_Cd|_Dd|_Et�|��dS)N�.������r�r��r�r�r�r\r[)r�rrT��exist_ok�/�_z.lockrz<Overwrite dataset info from restored data version if exists.zOld caching folder z for dataset z- exists but no data were found. Removing it. �fileFrR)Gr2rP�splitrXr�r�r�r�r�r��DEFAULT_WRITER_BATCH_SIZE�_writer_batch_sizer_r� from_patternsrr�inspect� signature�BUILDER_CONFIG_CLASS�__init__� parametersrq�_create_builder_configrr��get_exported_dataset_infor��_info� builder_namer�rZr�r�ry�HF_DATASETS_CACHE�_cache_dir_rootr<r�r�� expanduser� posixpathr��DOWNLOADED_DATASETS_DIR�DOWNLOADED_DATASETS_PATH�_cache_downloaded_dir�_legacy_relative_data_dir�_build_cache_dir� _cache_dir�makedirsr�as_posix�replacer;�existsr��listdir�DATASET_INFO_FILENAME�loggerr*�from_directory�warning�rmdir� _output_dir�fsspec� filesystem�_fs� dl_manager� _record_infos� _file_formatr7)rar�r�r�r�r�r�r�r�r�r\r[r�r�rq� lock_paths rTr�zDatasetBuilder.__init__-s���$0���0E�0E�c�0J�0J�2�0N�O�O�� �#'�� �"����� ��� �.�4�"���DP�_�2�<�@�@�@�VZ�V_���"3�"U�t�7U��� � !�*�Z��*O�*O� !�&�4�!�*�-�-�#� .�U�D�L`� a� a� a����J� ��*�4�+D�+M�N�N�Y� Y� Y�^f�^r�(0�M�*� %� � !�*4�M�,� '� � �(0�M�*� %�*���&A�d�&A�' �#�$�' �' ��' �' �#�� �T�^� �<��1�1�3�3�D� �K�K�� � � � � %� %� %� �I��� �-����;�+����{�*�� ��� � � �!)�D�I� � #�9�#H��0H�I�I���$1�$�2F�$G�$G� u�D� � �R�W�M_�M_�`d�`t�Mu�Mu� �� � 6�I�N�4�/��1O� P� P� P��V�4�5�5� �"��T�7�8�8� @�D� &� &���#�#�D�$>�?�?� �"�*.��&��/�/�1�1����T�1�2�2� 2� �K��,�t� <� <� <� <��� � ��$�d�4�?�&;�&;�&D�&D�&F�&F�&N�&N�s�TW�&X�&X�[b�&b���I��)�$�$� 2� 2��7�>�>�$�/�2�2� 2��2�:�d�o�6�6�7�7�!�;�;��7�>�>�"�'�,�,�t���Hd�*e�*e�f�f�T�"�K�K�(f�g�g�g�(3�(B�4�?�(S�(S�D�I�����Q�$�/�Q�Q�PT�Pa�Q�Q�Q��������1�1�1� 2� 2� 2� 2� 2� 2� 2� 2� 2� 2� 2���� 2� 2� 2� 2� �?���.4�.?��.G�.G������#���!��� -�T�2�2�2�2�2s�'DR � R�Rc��|jSrf)rm�ras rT� __getstate__zDatasetBuilder.__getstate__�s ���}�rSc�2�||_t|��dSrf)rmr7)ra�ds rT� __setstate__zDatasetBuilder.__setstate__�s���� �,�T�2�2�2�2�2rSrsc��dSrfrRr�s rT�manual_download_instructionsz+DatasetBuilder.manual_download_instructions�����trSc���|j�d���rKt|j���s8|jjdk�r)ddlm}|jr>|j� d��dkr |j� d��dnd}|j�|j� dd��n|j }||j t|jj��d�z}|�|jd ��d}t!j|�|j n |�d |j ��|d |��}t!j|j|��}t$j�|��r|SdSdSdSdS) z]Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13� datasets.rWr)�_PACKAGED_DATASETS_MODULESr�rNz--�missing�___rY)rP� startswithr<r�rrX�packaged_modulesr�r��countr�r�r�r�r��getr�r�r�r��isdir)rar�� namespacer�r�r��legacy_relative_data_dir�legacy_cache_dirs rT�_check_legacy_cachez"DatasetBuilder._check_legacy_cache�s��� �O� &� &�{� 3� 3� 0�!�$�"6�7�7� 0�� � �I�-�-� D� D� D� D� D� D�6:�l�l�t�|�GY�GY�Z]�G^�G^�ab�Gb�Gb�� 
    def _check_legacy_cache(self) -> Optional[str]:
        """Check for the old cache directory template {cache_dir}/{namespace}___{builder_name} from 2.13"""
        # Only relevant for script-based (non-packaged) builders using the default
        # config with no custom config kwargs; returns the legacy relative data dir
        # if such a cache directory already exists on disk.
        ...

    def _check_legacy_cache2(self, dataset_module: "DatasetModule") -> Optional[str]:
        """Check for the old cache directory template {cache_dir}/{namespace}___{dataset_name}/{config_name}-xxx from 2.14 and 2.15"""
        # Same idea as `_check_legacy_cache`, but for the hashed config-id layout of
        # datasets 2.14/2.15, updating the hash with the config parameters passed in
        # metadata from the README (excluding version, description and data_dir).
        ...

    @classmethod
    def get_all_exported_dataset_infos(cls) -> DatasetInfosDict:
        """Empty dict if it doesn't exist

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> ds_builder = load_dataset_builder('vivos')
        >>> ds_builder.get_all_exported_dataset_infos()
        {'default': DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)}
        ```
        """
        return DatasetInfosDict.from_directory(cls.get_imported_module_dir())

    def get_exported_dataset_info(self) -> DatasetInfo:
        """Empty `DatasetInfo` if it doesn't exist

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
        >>> ds_builder.get_exported_dataset_info()
        DatasetInfo(description='', citation='', homepage='', license='', features={'speaker_id': Value(dtype='string', id=None), 'path': Value(dtype='string', id=None), 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None), 'sentence': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name='default', version=None, splits={'train': SplitInfo(name='train', num_bytes=1722002133, num_examples=11660, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=86120227, num_examples=760, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=1475540500, post_processing_size=None, dataset_size=1808122360, size_in_bytes=None)
        ```
        """
        return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())

    def _create_builder_config(
        self, config_name=None, custom_features=None, **config_kwargs
    ) -> tuple[BuilderConfig, str]:
        """Create and validate BuilderConfig object as well as a unique config id for this config.
        Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
        config_kwargs override the default kwargs in the config.
        """
        # Picks the requested (or default, or single available) pre-defined config,
        # applies the config_kwargs overrides, validates the name and version
        # ("BuilderConfig ... not found. Available: ...", "BuilderConfig ... must
        # have a version"), and derives the config id via
        # `BuilderConfig.create_config_id`, logging custom data configurations.
        ...

    @classproperty
    @classmethod
    @memoize()
    def builder_configs(cls) -> dict[str, BuilderConfig]:
        """Dictionary of pre-defined configurations for this builder class."""
        configs = {config.name: config for config in cls.BUILDER_CONFIGS}
        if len(configs) != len(cls.BUILDER_CONFIGS):
            names = [config.name for config in cls.BUILDER_CONFIGS]
            raise ValueError(f"Names in BUILDER_CONFIGS must not be duplicated. Got {names}")
        return configs

    @property
    def cache_dir(self):
        return self._cache_dir

    def _use_legacy_cache_dir_if_possible(self, dataset_module: "DatasetModule"):
        # Keep reusing caches created by older versions of the library when they exist.
        self._legacy_relative_data_dir = (
            self._check_legacy_cache2(dataset_module) or self._check_legacy_cache() or None
        )
        self._cache_dir = self._build_cache_dir()
        self._output_dir = self._cache_dir

    def _relative_data_dir(self, with_version=True, with_hash=True) -> str:
        """Relative path of this dataset in cache_dir:
        Will be:
            self.dataset_name/self.config.version/self.hash/
        or if a repo_id with a namespace has been specified:
            self.namespace___self.dataset_name/self.config.version/self.hash/
        If any of these elements is missing or if ``with_version=False`` the corresponding subfolders are dropped.
        """
        ...

    def _build_cache_dir(self):
        """Return the data directory for the current version."""
        # Joins the cache root with `_relative_data_dir`, warning when a different
        # version of this dataset is already present in the cache directory.
        ...
    @abc.abstractmethod
    def _info(self) -> DatasetInfo:
        """Construct the DatasetInfo object. See `DatasetInfo` for details.

        Warning: This function is only called once and the result is cached for all
        following .info() calls.

        Returns:
            info: (DatasetInfo) The dataset information
        """
        raise NotImplementedError

    @classmethod
    def get_imported_module_dir(cls):
        """Return the path of the module of this class or subclass."""
        return os.path.dirname(inspect.getfile(inspect.getmodule(cls)))

    def _rename(self, src: str, dst: str):
        rename(self._fs, src, dst)

    def download_and_prepare(
        self,
        output_dir: Optional[str] = None,
        download_config: Optional[DownloadConfig] = None,
        download_mode: Optional[Union[DownloadMode, str]] = None,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        dl_manager: Optional[DownloadManager] = None,
        base_path: Optional[str] = None,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[int, str]] = None,
        num_proc: Optional[int] = None,
        storage_options: Optional[dict] = None,
        **download_and_prepare_kwargs,
    ):
        """Downloads and prepares dataset for reading.

        Args:
            output_dir (`str`, *optional*):
                Output directory for the dataset.
                Default to this builder's `cache_dir`, which is inside `~/.cache/huggingface/datasets` by default.

                <Added version="2.5.0"/>
            download_config (`DownloadConfig`, *optional*):
                Specific download configuration parameters.
            download_mode ([`DownloadMode`] or `str`, *optional*):
                Select the download/generate mode, default to `REUSE_DATASET_IF_EXISTS`.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the
                downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            dl_manager (`DownloadManager`, *optional*):
                Specific `DownloadManager` to use.
            base_path (`str`, *optional*):
                Base path for relative paths that are used to download files. This can be a remote url.
                If not specified, the value of the `base_path` attribute (`self.base_path`) will be used instead.
            file_format (`str`, *optional*):
                Format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
                If the format is "parquet", then image and audio data are embedded into the Parquet files
                instead of pointing to local files.

                <Added version="2.5.0"/>
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.

                <Added version="2.5.0"/>
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the caching file-system backend, if any.

                <Added version="2.5.0"/>
            **download_and_prepare_kwargs (additional keyword arguments): Keyword arguments.

        Example:

        Download and prepare the dataset as Arrow files that can be loaded as a Dataset using
        `builder.as_dataset()`:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare()
        ```

        Download and prepare the dataset as sharded Parquet files locally:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare("./output_dir", file_format="parquet")
        ```

        Download and prepare the dataset as sharded Parquet files in a cloud storage:

        ```py
        >>> from datasets import load_dataset_builder
        >>> storage_options = {"key": aws_access_key_id, "secret": aws_secret_access_key}
        >>> builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
        >>> builder.download_and_prepare("s3://my-bucket/my_rotten_tomatoes", storage_options=storage_options, file_format="parquet")
        ```
        """
        # Resolves the output filesystem (the local cache dir by default), the
        # download and verification modes, and rejects unsupported formats
        # ("Unsupported file_format: ... Expected 'arrow' or 'parquet'"). Under a
        # `_builder.lock` file lock it reuses a cached dataset when data already
        # exist and the mode is REUSE_DATASET_IF_EXISTS; otherwise it checks the
        # available disk space, runs `_download_and_prepare` inside an ".incomplete"
        # directory that is renamed into place on success, saves the DatasetInfo,
        # and logs that subsequent calls will reuse this data.
        ...

    def _check_manual_download(self, dl_manager):
        if self.manual_download_instructions is not None and dl_manager.manual_dir is None:
            raise ManualDownloadError(
                textwrap.dedent(
                    f"""\
                    The dataset {self.dataset_name} with config {self.config.name} requires manual data.
                    Please follow the manual download instructions:
                     {self.manual_download_instructions}
                    Manual data can be loaded with:
                     datasets.load_dataset("{self.repo_id or self.dataset_name}", data_dir="<path/to/manual/data>")"""
                )
            )
    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs):
        """Downloads and prepares dataset for reading.

        This is the internal implementation to override, called when the user calls
        `download_and_prepare`. It should download all required data and generate
        the pre-processed datasets files.

        Args:
            dl_manager ([`DownloadManager`]):
                `DownloadManager` used to download and cache data.
            verification_mode ([`VerificationMode`]):
                if `ALL_CHECKS`, perform all the verifications including checksums.
                if `BASIC_CHECKS`, do not perform checksums, only perform split tests.
                if `NO_CHECKS`, do not perform any verification.
            prepare_split_kwargs: Additional options, such as `file_format`, `max_shard_size`
        """
        # Builds a SplitDict, calls `_split_generators`, verifies the download
        # checksums ("dataset source files") when ALL_CHECKS is requested, rejects
        # "all" as a split name (it is a special keyword for the union of all
        # splits), runs `_prepare_split` for each split generator (wrapping missing
        # data files in OSError and duplicate keys in DuplicatedKeysError), then
        # verifies the resulting splits and records the downloaded sizes.
        ...

    def download_post_processing_resources(self, dl_manager):
        for split in self.info.splits or []:
            for resource_name, resource_file_name in self._post_processing_resources(split).items():
                if is_remote_filesystem(self._fs):
                    raise NotImplementedError(f"Post processing is not supported on filesystem {self._fs}")
                if os.sep in resource_file_name:
                    raise ValueError(f"Resources shouldn't be in a sub-directory: {resource_file_name}")
                resource_path = os.path.join(self._output_dir, resource_file_name)
                if not os.path.exists(resource_path):
                    downloaded_resource_path = self._download_post_processing_resources(
                        split, resource_name, dl_manager
                    )
                    if downloaded_resource_path:
                        logger.info(f"Downloaded post-processing resource {resource_name} as {resource_file_name}")
                        shutil.move(downloaded_resource_path, resource_path)

    def _load_info(self) -> DatasetInfo:
        return DatasetInfo.from_directory(self._output_dir, storage_options=self._fs.storage_options)

    def _save_info(self):
        file_lock = (
            FileLock(self._output_dir + "_info.lock")
            if not is_remote_filesystem(self._fs)
            else contextlib.nullcontext()
        )
        with file_lock:
            self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)

    def _save_infos(self):
        file_lock = (
            FileLock(self._output_dir + "_infos.lock")
            if not is_remote_filesystem(self._fs)
            else contextlib.nullcontext()
        )
        with file_lock:
            DatasetInfosDict(**{self.config.name: self.info}).write_to_directory(self.get_imported_module_dir())

    def _make_split_generators_kwargs(self, prepare_split_kwargs):
        """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`."""
        del prepare_split_kwargs
        return {}

    def as_dataset(
        self,
        split: Optional[Split] = None,
        run_post_process=True,
        verification_mode: Optional[Union[VerificationMode, str]] = None,
        in_memory=False,
    ) -> Union[Dataset, DatasetDict]:
        """Return a Dataset for the specified split.

        Args:
            split (`datasets.Split`):
                Which subset of the data to return.
            run_post_process (`bool`, defaults to `True`):
                Whether to run post-processing dataset transforms and/or add indexes.
            verification_mode ([`VerificationMode`] or `str`, defaults to `BASIC_CHECKS`):
                Verification mode determining the checks to run on the
                downloaded/processed dataset information (checksums/size/splits/...).

                <Added version="2.9.1"/>
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            datasets.Dataset

        Example:

        ```py
        >>> from datasets import load_dataset_builder
        >>> builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
        >>> builder.download_and_prepare()
        >>> ds = builder.as_dataset(split='train')
        >>> ds
        Dataset({
            features: ['text', 'label'],
            num_rows: 8530
        })
        ```
        """
        # Only datasets written in the "arrow" format on a local filesystem can be
        # loaded; raises FileNotFoundError with a reminder to call
        # `builder.download_and_prepare()` when no data is found, then maps
        # `_build_single_dataset` over the requested split(s).
        ...

    def _build_single_dataset(
        self,
        split: Union[str, ReadInstruction, Split],
        run_post_process: bool,
        verification_mode: VerificationMode,
        in_memory: bool = False,
    ) -> Dataset:
        """as_dataset for a single split."""
        # Reads the split with `_as_dataset`, then optionally runs `_post_process`
        # with the downloaded post-processing resources, verifies their checksums
        # when requested, and updates the dataset info sizes accordingly.
        ...

    def _as_dataset(self, split: Union[ReadInstruction, Split] = Split.TRAIN, in_memory: bool = False) -> Dataset:
        """Constructs a `Dataset`.

        This is the internal implementation to override, called when the user calls
        `as_dataset`. It should read the pre-processed datasets files and generate
        the `Dataset` object.

        Args:
            split (`datasets.Split`):
                which subset of the data to read.
            in_memory (`bool`, defaults to `False`):
                Whether to copy the data in-memory.

        Returns:
            `Dataset`
        """
        cache_dir = self._fs._strip_protocol(self._output_dir)
        dataset_name = self.dataset_name
        if self._check_legacy_cache():
            dataset_name = self.name
        dataset_kwargs = ArrowReader(cache_dir, self.info).read(
            name=dataset_name,
            instructions=split,
            split_infos=self.info.splits.values(),
            in_memory=in_memory,
        )
        fingerprint = self._get_dataset_fingerprint(split)
        return Dataset(fingerprint=fingerprint, **dataset_kwargs)

    def _get_dataset_fingerprint(self, split: Union[ReadInstruction, Split]) -> str:
        """The dataset fingerprint is the hash of the relative directory
        dataset_name/config_name/version/hash, as well as the split specs."""
        hasher = Hasher()
        hasher.update(Path(self._relative_data_dir()).as_posix())
        hasher.update(str(split))
        return hasher.hexdigest()

    def as_streaming_dataset(
        self,
        split: Optional[str] = None,
        base_path: Optional[str] = None,
    ) -> Union[dict[str, IterableDataset], IterableDataset]:
        if is_remote_filesystem(self._fs):
            raise NotImplementedError(
                f"Loading a streaming dataset cached in a {type(self._fs).__name__} is not supported yet."
            )
        dl_manager = StreamingDownloadManager(
            base_path=base_path or self.base_path,
            download_config=DownloadConfig(token=self.token, storage_options=self.storage_options),
            dataset_name=self.dataset_name,
            data_dir=self.config.data_dir,
        )
        self._check_manual_download(dl_manager)
        splits_generators = {sg.name: sg for sg in self._split_generators(dl_manager)}
        # By default, return all splits
        if split is None:
            splits_generator = splits_generators
        elif split in splits_generators:
            splits_generator = splits_generators[split]
        else:
            raise ValueError(f"Bad split: {split}. Available splits: {list(splits_generators)}")

        # Create a dataset for each of the given splits
        datasets = map_nested(
            self._as_streaming_dataset_single,
            splits_generator,
            map_tuple=True,
        )
        if isinstance(datasets, dict):
            datasets = IterableDatasetDict(datasets)
        return datasets

    def _as_streaming_dataset_single(self, splits_generator) -> IterableDataset:
        ex_iterable = self._get_examples_iterable_for_split(splits_generator)
        # Pass the token along so gated/private repos stay readable while streaming.
        token_per_repo_id = {self.repo_id: self.token} if self.repo_id else {}
        return IterableDataset(
            ex_iterable, info=self.info, split=splits_generator.name, token_per_repo_id=token_per_repo_id
        )

    def _post_process(self, dataset: Dataset, resources_paths: Mapping[str, str]) -> Optional[Dataset]:
        """Run dataset transforms or add indexes"""

    def _post_processing_resources(self, split: str) -> dict[str, str]:
        """Mapping resource_name -> resource_file_name"""
        return {}

    def _download_post_processing_resources(
        self, split: str, resource_name: str, dl_manager: DownloadManager
    ) -> Optional[str]:
        """Download the resource using the download manager and return the downloaded path."""

    @abc.abstractmethod
    def _split_generators(self, dl_manager: Union[DownloadManager, StreamingDownloadManager]):
        """Specify feature dictionary generators and dataset splits.

        This function returns a list of `SplitGenerator`s defining how to generate
        data and what splits to use.

        Example:

            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={'file': 'train_data.zip'},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={'file': 'test_data.zip'},
                ),
            ]

        The above code will first call `_generate_examples(file='train_data.zip')`
        to write the train data, then `_generate_examples(file='test_data.zip')` to
        write the test data.

        Datasets are typically split into different subsets to be used at various
        stages of training and evaluation.

        Note that for datasets without a `VALIDATION` split, you can use a
        fraction of the `TRAIN` data for evaluation as you iterate on your model
        so as not to overfit to the `TEST` data.

        For downloads and extractions, use the given `download_manager`.
        Note that the `DownloadManager` caches downloads, so it is fine to have each
        generator attempt to download the source data.

        A good practice is to download all data in this function, and then
        distribute the relevant parts to each split with the `gen_kwargs` argument.

        Args:
            dl_manager (`Union[DownloadManager, StreamingDownloadManager]`):
                Download manager to download the data

        Returns:
            `list<SplitGenerator>`.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        file_format: str = "arrow",
        max_shard_size: Optional[Union[str, int]] = None,
        num_proc: Optional[int] = None,
        **kwargs,
    ):
        """Generate the examples and record them on disk.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
            file_format (`str`, *optional*):
                format of the data files in which the dataset will be written.
                Supported formats: "arrow", "parquet". Default to "arrow" format.
            max_shard_size (`Union[str, int]`, *optional*):
                Maximum number of bytes written per shard, default is "500MB".
                The size is based on uncompressed data size, so in practice your shard files may be smaller than
                `max_shard_size` thanks to Parquet compression for example.
            num_proc (`int`, *optional*, defaults to `None`):
                Number of processes when downloading and generating the dataset locally.
                Multiprocessing is disabled by default.

                <Added version="2.7.0"/>
            **kwargs: Additional kwargs forwarded from _download_and_prepare
        """
        raise NotImplementedError()

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        """Generate the examples on the fly.

        Args:
            split_generator (`SplitGenerator`):
                Split generator to process
        """
        raise NotImplementedError()
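# A standalone sketch (not part of this module) tying together the public workflow
# documented above: `load_dataset_builder` constructs a builder,
# `download_and_prepare` materializes the Arrow (or Parquet) files, and
# `as_dataset` reads them back. The dataset id comes from the docstring examples.
def _example_prepare_and_load():
    from datasets import load_dataset_builder

    builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    # Inspect before generating anything: pre-defined configs and dataset info.
    print(builder.builder_configs)
    print(builder.info)
    # Write Arrow files to the cache, then load the train split as a Dataset.
    builder.download_and_prepare()
    return builder.as_dataset(split="train")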
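# A sketch of the streaming path (assuming the dataset's files are remotely
# readable): `as_streaming_dataset` drives `_split_generators` with a
# `StreamingDownloadManager`, so examples are yielded on the fly and no Arrow
# files are written to disk.
def _example_stream():
    from datasets import load_dataset_builder

    builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")
    ids = builder.as_streaming_dataset(split="train")  # an IterableDataset
    for example in ids.take(3):
        print(example)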
class GeneratorBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on dict generators.

    `GeneratorBasedBuilder` is a convenience class that abstracts away much
    of the data writing and reading of `DatasetBuilder`. It expects subclasses to
    implement generators of feature dictionaries across the dataset splits
    (`_split_generators`). See the method docstrings for details.
    """

    @abc.abstractmethod
    def _generate_examples(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocesses the examples from the raw data to the preprocessed dataset files.
        This function is called once for each `SplitGenerator` defined in `_split_generators`.
        The examples yielded here will be written on disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yielded with the same key.
                * Deterministic: When generating the dataset twice, the same example should have the same key.
                Good keys can be the image id, or line number if examples are extracted from a text file.
                The key will be hashed and sorted to shuffle examples deterministically, so that generating
                the dataset multiple times keeps examples in the same order.
            example: `dict<str feature_name, feature_value>`, a feature dictionary
                ready to be encoded and written to disk. The example will be
                encoded with `self.info.features.encode_example({...})`.
        """
        raise NotImplementedError()
    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        check_duplicate_keys: bool,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[str, int]] = None,
    ):
        # Writes the split as shards named "{fname}-JJJJJ-SSSSS-of-NNNNN",
        # optionally in parallel via `_prepare_split_single` jobs; when the split
        # only contains one input shard, num_proc is set back to 1 to disable
        # multiprocessing, and the shards are renamed once all jobs have finished.
        ...

    def _prepare_split_single(
        self,
        gen_kwargs: dict,
        fpath: str,
        file_format: str,
        max_shard_size: int,
        split_info: SplitInfo,
        check_duplicate_keys: bool,
        job_id: int,
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        # Iterates `_generate_examples(**gen_kwargs)`, encodes each example with an
        # ArrowWriter (or ParquetWriter for the "parquet" file format, which embeds
        # local files), enforces unique keys via DuplicatedKeysError when requested,
        # and periodically yields progress updates; failures are re-raised as
        # DatasetGenerationError("An error occurred while generating the dataset").
        ...

    def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_kwargs):
        super()._download_and_prepare(
            dl_manager,
            verification_mode,
            check_duplicate_keys=verification_mode == VerificationMode.BASIC_CHECKS
            or verification_mode == VerificationMode.ALL_CHECKS,
            **prepare_splits_kwargs,
        )

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        return ExamplesIterable(self._generate_examples, split_generator.gen_kwargs)
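# A self-contained sketch (hypothetical dataset, placeholder URLs) of a complete
# `GeneratorBasedBuilder` subclass implementing the three hooks described above:
# `_info`, `_split_generators`, and `_generate_examples`.
from .features import Value  # not imported at module level; used by the sketch only


class _ExampleLinesBuilder(GeneratorBasedBuilder):
    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            description="One text example per line.",
            features=Features({"text": Value("string")}),
        )

    def _split_generators(self, dl_manager):
        # The download manager caches downloads; gen_kwargs are forwarded to
        # `_generate_examples` below.
        files = dl_manager.download(
            {"train": "https://example.com/train.txt", "test": "https://example.com/test.txt"}
        )
        return [
            SplitGenerator(name=Split.TRAIN, gen_kwargs={"filepath": files["train"]}),
            SplitGenerator(name=Split.TEST, gen_kwargs={"filepath": files["test"]}),
        ]

    def _generate_examples(self, filepath):
        # Yield (key, example) pairs; keys must be unique and deterministic, and
        # the line number satisfies both requirements here.
        with open(filepath, encoding="utf-8") as f:
            for line_number, line in enumerate(f):
                yield line_number, {"text": line.rstrip("\n")}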
class ArrowBasedBuilder(DatasetBuilder):
    """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""

    @abc.abstractmethod
    def _generate_tables(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.

        This function preprocesses the examples from the raw data to the preprocessed dataset files.
        This function is called once for each `SplitGenerator` defined in `_split_generators`.
        The examples yielded here will be written on disk.

        Args:
            **kwargs (additional keyword arguments):
                Arguments forwarded from the SplitGenerator.gen_kwargs

        Yields:
            key: `str` or `int`, a unique deterministic example identification key.
                * Unique: An error will be raised if two examples are yielded with the same key.
                * Deterministic: When generating the dataset twice, the same example should have the same key.
                Good keys can be the image id, or line number if examples are extracted from a text file.
                The key will be hashed and sorted to shuffle examples deterministically, so that generating
                the dataset multiple times keeps examples in the same order.
            example: `pyarrow.Table`, a feature table ready to be encoded and written to disk.
        """
        raise NotImplementedError()

    def _prepare_split(
        self,
        split_generator: SplitGenerator,
        file_format: str = "arrow",
        num_proc: Optional[int] = None,
        max_shard_size: Optional[Union[str, int]] = None,
    ):
        # Same sharding/multiprocessing scheme as GeneratorBasedBuilder._prepare_split,
        # driven by `_generate_tables` instead of `_generate_examples`.
        ...

    def _prepare_split_single(
        self, gen_kwargs: dict, fpath: str, file_format: str, max_shard_size: int, job_id: int
    ) -> Iterable[tuple[int, bool, Union[int, tuple]]]:
        # Writes the tables yielded by `_generate_tables(**gen_kwargs)` with an
        # ArrowWriter/ParquetWriter; schema mismatches surface as
        # DatasetGenerationCastError, other failures as DatasetGenerationError.
        ...

    def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> ExamplesIterable:
        return ArrowExamplesIterable(self._generate_tables, kwargs=split_generator.gen_kwargs)
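# A matching sketch (hypothetical CSV dataset) for `ArrowBasedBuilder`: instead of
# one dict per example, `_generate_tables` yields whole `pyarrow.Table` chunks,
# which is how the packaged csv/json/parquet builders are organized.
import pyarrow.csv  # not imported at module level; used by the sketch only


class _ExampleCsvBuilder(ArrowBasedBuilder):
    def _info(self) -> DatasetInfo:
        # Features can be omitted and inferred from the yielded tables' schema.
        return DatasetInfo(description="CSV files read as Arrow tables.")

    def _split_generators(self, dl_manager):
        files = dl_manager.download({"train": "https://example.com/train.csv"})
        return [SplitGenerator(name=Split.TRAIN, gen_kwargs={"files": [files["train"]]})]

    def _generate_tables(self, files):
        # One (key, table) pair per input file; the file index keeps keys unique.
        for file_idx, file in enumerate(files):
            yield file_idx, pyarrow.csv.read_csv(file)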