"""Provides `partition_json()`. Note this does not partition arbitrary JSON. Its only use-case is to "rehydrate" unstructured document elements serialized to JSON, essentially the same function as `elements_from_json()`, but this allows a document of already-partitioned elements to be combined transparently with other documents in a partitioning run. It also allows multiple (low-cost) chunking runs to be performed on a document while only incurring partitioning cost once. """ from __future__ import annotations import json from typing import IO, Any, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata from unstructured.file_utils.filetype import ( FileType, add_metadata_with_filetype, is_json_processable, ) from unstructured.partition.common.common import exactly_one from unstructured.partition.common.metadata import get_last_modified_date from unstructured.staging.base import elements_from_dicts @process_metadata() @add_metadata_with_filetype(FileType.JSON) @add_chunking_strategy def partition_json( filename: Optional[str] = None, file: Optional[IO[bytes]] = None, text: Optional[str] = None, metadata_last_modified: Optional[str] = None, **kwargs: Any, ) -> list[Element]: """Partitions serialized Unstructured output into its constituent elements. Parameters ---------- filename A string defining the target filename path. file A file-like object as bytes --> open(filename, "rb"). text The string representation of the .json document. metadata_last_modified The last modified date for the document. """ if text is not None and text.strip() == "" and not file and not filename: return [] exactly_one(filename=filename, file=file, text=text) last_modified = get_last_modified_date(filename) if filename else None file_text = "" if filename is not None: with open(filename, encoding="utf8") as f: file_text = f.read() elif file is not None: file_content = file.read() file_text = file_content if isinstance(file_content, str) else file_content.decode() file.seek(0) elif text is not None: file_text = str(text) if not is_json_processable(file_text=file_text): raise ValueError( "JSON cannot be partitioned. Schema does not match the Unstructured schema.", ) try: element_dicts = json.loads(file_text) elements = elements_from_dicts(element_dicts) except json.JSONDecodeError: raise ValueError("Not a valid json") for element in elements: element.metadata.last_modified = metadata_last_modified or last_modified return elements
Memory