from __future__ import annotations
import re
from html import unescape
from typing import Any
from urllib.parse import unquote
from .exceptions import DuckDuckGoSearchException
# Prefer orjson (fast C JSON library) when installed; otherwise fall back
# to the stdlib json module. HAS_ORJSON records which one is active.
try:
    import orjson

    HAS_ORJSON = True
except ImportError:
    import json

    HAS_ORJSON = False

# Matches any single HTML tag (non-greedy), used to strip markup from snippets.
REGEX_STRIP_TAGS = re.compile("<.*?>")
def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a 2-space-indented JSON string.

    Uses orjson when available, otherwise the stdlib json module
    (with ``ensure_ascii=False`` so non-ASCII text is kept as-is).

    Raises:
        DuckDuckGoSearchException: if serialization fails for any reason.
    """
    try:
        if HAS_ORJSON:
            return orjson.dumps(obj, option=orjson.OPT_INDENT_2).decode()
        return json.dumps(obj, ensure_ascii=False, indent=2)
    except Exception as ex:
        raise DuckDuckGoSearchException(f"{type(ex).__name__}: {ex}") from ex
def json_loads(obj: str | bytes) -> Any:
    """Deserialize a JSON string or bytes into a Python object.

    Uses orjson when available, otherwise the stdlib json module.

    Raises:
        DuckDuckGoSearchException: if parsing fails for any reason.
    """
    try:
        if HAS_ORJSON:
            return orjson.loads(obj)
        return json.loads(obj)
    except Exception as ex:
        raise DuckDuckGoSearchException(f"{type(ex).__name__}: {ex}") from ex
def _extract_vqd(html_bytes: bytes, keywords: str) -> str:
"""Extract vqd from html bytes."""
for c1, c1_len, c2 in (
(b'vqd="', 5, b'"'),
(b"vqd=", 4, b"&"),
(b"vqd='", 5, b"'"),
):
try:
start = html_bytes.index(c1) + c1_len
end = html_bytes.index(c2, start)
return html_bytes[start:end].decode()
except ValueError:
pass
raise DuckDuckGoSearchException(f"_extract_vqd() {keywords=} Could not extract vqd.")
def _normalize(raw_html: str) -> str:
"""Strip HTML tags from the raw_html string."""
return unescape(REGEX_STRIP_TAGS.sub("", raw_html)) if raw_html else ""
def _normalize_url(url: str) -> str:
"""Unquote URL and replace spaces with '+'."""
return unquote(url).replace(" ", "+") if url else ""
def _expand_proxy_tb_alias(proxy: str | None) -> str | None:
"""Expand "tb" to a full proxy URL if applicable."""
return "socks5://127.0.0.1:9150" if proxy == "tb" else proxy