import os from enum import Enum class Source(Enum): PDFMINER = "pdfminer" OCR_TESSERACT = "ocr_tesseract" OCR_PADDLE = "ocr_paddle" OCR_GOOGLEVISION = "ocr_googlevision" class OCRMode(Enum): INDIVIDUAL_BLOCKS = "individual_blocks" FULL_PAGE = "entire_page" class PartitionStrategy: AUTO = "auto" FAST = "fast" OCR_ONLY = "ocr_only" HI_RES = "hi_res" SORT_MODE_XY_CUT = "xy-cut" SORT_MODE_BASIC = "basic" SORT_MODE_DONT = "dont" OCR_AGENT_TESSERACT_OLD = "tesseract" OCR_AGENT_PADDLE_OLD = "paddle" OCR_AGENT_TESSERACT = "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract" OCR_AGENT_PADDLE = "unstructured.partition.utils.ocr_models.paddle_ocr.OCRAgentPaddle" OCR_AGENT_GOOGLEVISION = ( "unstructured.partition.utils.ocr_models.google_vision_ocr.OCRAgentGoogleVision" ) OCR_AGENT_MODULES_WHITELIST = os.getenv( "OCR_AGENT_MODULES_WHITELIST", "unstructured.partition.utils.ocr_models.tesseract_ocr," "unstructured.partition.utils.ocr_models.paddle_ocr," "unstructured.partition.utils.ocr_models.google_vision_ocr", ).split(",") UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False) # this field is defined by unstructured_pytesseract TESSERACT_TEXT_HEIGHT = "height" TESSERACT_LANGUAGES_SPLITTER = "+" # source: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html # All languages have been changed to lowercase and have been altered to remove dates and "(contrib)" # Ex: "Greek, Ancient (to 1453) (contrib)" -> "greek, ancient" # Where it seemed appropriate, languages have been split into multiple keys with the same value. # Ex: "greek, modern":"ell", "greek":"ell", "chinese - simplified":"chi_sim", "chinese":"chi_sim", # On tesseract-ocr.github.io, "Spanish" matches with both "spa_old" and "spa". # Here, it only matches with "spa" and "spanish - old":"spa_old" was added. TESSERACT_LANGUAGES_AND_CODES = { "afrikaans": "afr", "amharic": "amh", "arabic": "ara", "assamese": "asm", "azerbaijani": "aze", "azerbaijani - cyrilic": "aze_cyrl", "belarusian": "bel", "bengali": "ben", "tibetan": "bod", "bosnian": "bos", "breton": "bre", "bulgarian": "bul", "catalan; Valencian": "cat", "cebuano": "ceb", "czech": "ces", "chinese - simplified": "chi_sim", "chinese": "chi_sim", "chinese - traditional": "chi_tra", "cherokee": "chr", "corsican": "cos", "welsh": "cym", "danish": "dan", "danish - fraktur": "dan_frak", "german": "deu", "german - fraktur (contrib)": "deu_frak", # "contrib" not removed because it would repeat key "dzongkha": "dzo", "greek, modern": "ell", "greek": "ell", "english": "eng", "english, middle": "enm", "esperanto": "epo", "math / equation detection module": "equ", "estonian": "est", "basque": "eus", "faroese": "fao", "persian": "fas", "filipino (old - tagalog)": "fil", "filipino": "fil", "finnish": "fin", "french": "fra", "german - fraktur": "frk", "french, middle": "frm", "western frisian": "fry", "scottish gaelic": "gla", "irish": "gle", "galician": "glg", "greek, ancient": "grc", "gujarati": "guj", "haitian": "hat", "haitian creole": "hat", "hebrew": "heb", "hindi": "hin", "croatian": "hrv", "hungarian": "hun", "armenian": "hye", "inuktitut": "iku", "indonesian": "ind", "icelandic": "isl", "italian": "ita", "italian - old": "ita_old", "javanese": "jav", "japanese": "jpn", "kannada": "kan", "georgian": "kat", "georgian - old": "kat_old", "kazakh": "kaz", "central khmer": "khm", "kirghiz": "kir", "kyrgyz": "kir", "kurmanji (kurdish - latin script)": "kmr", "korean": "kor", "korean (vertical)": "kor_vert", "kurdish (arabic script)": "kur", "lao": "lao", "latin": "lat", "latvian": "lav", "lithuanian": "lit", "luxembourgish": "ltz", "malayalam": "mal", "marathi": "mar", "macedonian": "mkd", "maltese": "mlt", "mongolian": "mon", "maori": "mri", "malay": "msa", "burmese": "mya", "nepali": "nep", "dutch": "nld", "flemish": "nld", "norwegian": "nor", "occitan": "oci", "oriya": "ori", "orientation and script detection module": "osd", "panjabi": "pan", "punjabi": "pan", "polish": "pol", "portuguese": "por", "pushto": "pus", "pashto": "pus", "quechua": "que", "romanian": "ron", "moldavian": "ron", "moldovan": "ron", "russian": "rus", "sanskrit": "san", "sinhala": "sin", "sinhalese": "sin", "slovak": "slk", "slovak - fraktur": "slk_frak", "slovenian": "slv", "sindhi": "snd", "spanish": "spa", "castilian": "spa", "spanish - old": "spa_old", "castilian - old": "spa_old", "albanian": "sqi", "serbian": "srp", "serbian - latin": "srp_latn", "sundanese": "sun", "swahili": "swa", "swedish": "swe", "syriac": "syr", "tamil": "tam", "tatar": "tat", "telugu": "tel", "tajik": "tgk", "tagalog": "tgl", "thai": "tha", "tigrinya": "tir", "tonga": "ton", "turkish": "tur", "uighur": "uig", "uyghur": "uig", "ukrainian": "ukr", "urdu": "urd", "uzbek": "uzb", "uzbek - cyrilic": "uzb_cyrl", "vietnamese": "vie", "yiddish": "yid", "yoruba": "yor", } # 2 ** 31 - 1, max byte size for image data TESSERACT_MAX_SIZE = 2147483647 # default image colors IMAGE_COLOR_DEPTH = 32 HTML_MAX_PREDECESSOR_LEN = 15