from __future__ import annotations
import logging
import os
import warnings
from datetime import datetime, timezone
from functools import cached_property
from itertools import cycle
from random import choice, shuffle
from time import sleep, time
from types import TracebackType
from typing import Any, Literal
import primp
from lxml.etree import _Element
from lxml.html import HTMLParser as LHTMLParser
from lxml.html import document_fromstring
from .exceptions import DuckDuckGoSearchException, RatelimitException, TimeoutException
from .utils import (
_expand_proxy_tb_alias,
_extract_vqd,
_normalize,
_normalize_url,
json_loads,
)
logger = logging.getLogger("duckduckgo_search.DDGS")
class DDGS:
"""DuckDuckgo_search class to get search results from duckduckgo.com."""
_impersonates = (
"chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107",
"chrome_108", "chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118",
"chrome_119", "chrome_120", "chrome_123", "chrome_124", "chrome_126", "chrome_127",
"chrome_128", "chrome_129", "chrome_130", "chrome_131", "chrome_133",
"safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_ios_18.1.1",
"safari_15.3", "safari_15.5", "safari_15.6.1", "safari_16", "safari_16.5",
"safari_17.0", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
"safari_18", "safari_18.2",
"safari_ipad_18",
"edge_101", "edge_122", "edge_127", "edge_131",
"firefox_109", "firefox_117", "firefox_128", "firefox_133", "firefox_135",
) # fmt: skip
_impersonates_os = ("android", "ios", "linux", "macos", "windows")
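    # chat model aliases mapped to the full provider model ids expected by the chat backend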
_chat_models = {
"gpt-4o-mini": "gpt-4o-mini",
"llama-3.3-70b": "meta-llama/Llama-3.3-70B-Instruct-Turbo",
"claude-3-haiku": "claude-3-haiku-20240307",
"o3-mini": "o3-mini",
"mistral-small-3": "mistralai/Mistral-Small-24B-Instruct-2501",
}
def __init__(
self,
headers: dict[str, str] | None = None,
proxy: str | None = None,
proxies: dict[str, str] | str | None = None, # deprecated
timeout: int | None = 10,
verify: bool = True,
) -> None:
"""Initialize the DDGS object.
Args:
headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
            proxy (str, optional): proxy for the HTTP client, supports http/https/socks5 protocols,
                e.g. "http://user:pass@example.com:3128". Defaults to None.
            proxies (dict | str, optional): deprecated, use 'proxy' instead. Defaults to None.
            timeout (int, optional): Timeout value for the HTTP client. Defaults to 10.
verify (bool): SSL verification when making the request. Defaults to True.
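        Example:
            A minimal construction sketch; the proxy URL is the placeholder documented above,
            not a working endpoint:
            >>> ddgs = DDGS(timeout=20)
            >>> ddgs = DDGS(proxy="http://user:pass@example.com:3128", timeout=20)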
"""
ddgs_proxy: str | None = os.environ.get("DDGS_PROXY")
self.proxy: str | None = ddgs_proxy if ddgs_proxy else _expand_proxy_tb_alias(proxy)
assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
if not proxy and proxies:
warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
            self.proxy = (proxies.get("http") or proxies.get("https")) if isinstance(proxies, dict) else proxies
self.headers = headers if headers else {}
self.headers["Referer"] = "https://duckduckgo.com/"
self.timeout = timeout
self.client = primp.Client(
            headers=self.headers,
proxy=self.proxy,
timeout=self.timeout,
cookie_store=True,
referer=True,
impersonate=choice(self._impersonates), # type: ignore
impersonate_os=choice(self._impersonates_os), # type: ignore
follow_redirects=False,
verify=verify,
)
self._chat_messages: list[dict[str, str]] = []
self._chat_tokens_count = 0
self._chat_vqd: str = ""
self._chat_vqd_hash: str = ""
self._chat_xfe: str = ""
self.sleep_timestamp = 0.0
def __enter__(self) -> DDGS:
return self
def __exit__(
self,
exc_type: type[BaseException] | None = None,
exc_val: BaseException | None = None,
exc_tb: TracebackType | None = None,
) -> None:
pass
@cached_property
def parser(self) -> LHTMLParser:
"""Get HTML parser."""
return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)
    def _sleep(self, sleeptime: float = 0.75) -> None:
        """Sleep between API requests; skip the delay if the previous request was over 20 seconds ago."""
        now = time()
        delay = sleeptime if self.sleep_timestamp and now - self.sleep_timestamp < 20 else 0.0
        self.sleep_timestamp = now
        sleep(delay)
def _get_url(
self,
method: Literal["GET", "HEAD", "OPTIONS", "DELETE", "POST", "PUT", "PATCH"],
url: str,
params: dict[str, str] | None = None,
content: bytes | None = None,
data: dict[str, str] | None = None,
headers: dict[str, str] | None = None,
cookies: dict[str, str] | None = None,
json: Any = None,
timeout: float | None = None,
) -> Any:
self._sleep()
try:
resp = self.client.request(
method,
url,
params=params,
content=content,
data=data,
headers=headers,
cookies=cookies,
json=json,
timeout=timeout or self.timeout,
)
except Exception as ex:
if "time" in str(ex).lower():
raise TimeoutException(f"{url} {type(ex).__name__}: {ex}") from ex
raise DuckDuckGoSearchException(f"{url} {type(ex).__name__}: {ex}") from ex
logger.debug(f"_get_url() {resp.url} {resp.status_code}")
if resp.status_code == 200:
return resp
        elif resp.status_code in (202, 301, 400, 403, 418, 429):
            raise RatelimitException(f"{resp.url} {resp.status_code} Ratelimit")
        raise DuckDuckGoSearchException(f"{resp.url} returned {resp.status_code}. {params=} {content=} {data=}")
def _get_vqd(self, keywords: str) -> str:
"""Get vqd value for a search query."""
resp_content = self._get_url("GET", "https://duckduckgo.com", params={"q": keywords}).content
return _extract_vqd(resp_content, keywords)
def text(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: str | None = None,
backend: str = "auto",
max_results: int | None = None,
) -> list[dict[str, str]]:
"""DuckDuckGo text search. Query params: https://duckduckgo.com/params.
Args:
keywords: keywords for query.
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
safesearch: on, moderate, off. Defaults to "moderate".
timelimit: d, w, m, y. Defaults to None.
backend: auto, html, lite. Defaults to auto.
auto - try all backends in random order,
html - collect data from https://html.duckduckgo.com,
lite - collect data from https://lite.duckduckgo.com.
max_results: max number of results. If None, returns results only from the first response. Defaults to None.
Returns:
            List of dictionaries with search results.
Raises:
DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
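        Example:
            A minimal usage sketch (needs network access; each result dict carries "title", "href" and "body"):
            >>> with DDGS() as ddgs:
            ...     results = ddgs.text("python programming", region="us-en", max_results=5)
            ...     urls = [r["href"] for r in results]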
"""
if backend in ("api", "ecosia"):
warnings.warn(f"{backend=} is deprecated, using backend='auto'", stacklevel=2)
backend = "auto"
backends = ["html", "lite"] if backend == "auto" else [backend]
shuffle(backends)
results, err = [], None
for b in backends:
try:
if b == "html":
results = self._text_html(keywords, region, timelimit, max_results)
elif b == "lite":
results = self._text_lite(keywords, region, timelimit, max_results)
return results
except Exception as ex:
logger.info(f"Error to search using {b} backend: {ex}")
err = ex
raise DuckDuckGoSearchException(err)
def _text_html(
self,
keywords: str,
region: str = "wt-wt",
timelimit: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
assert keywords, "keywords is mandatory"
payload = {
"q": keywords,
"b": "",
"kl": region,
}
if timelimit:
payload["df"] = timelimit
cache = set()
results: list[dict[str, str]] = []
for _ in range(5):
resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload).content
if b"No results." in resp_content:
return results
tree = document_fromstring(resp_content, self.parser)
elements = tree.xpath("//div[h2]")
if not isinstance(elements, list):
return results
for e in elements:
if isinstance(e, _Element):
hrefxpath = e.xpath("./a/@href")
href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
if (
href
and href not in cache
and not href.startswith(
("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
)
):
cache.add(href)
titlexpath = e.xpath("./h2/a/text()")
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
bodyxpath = e.xpath("./a//text()")
body = "".join(str(x) for x in bodyxpath) if bodyxpath and isinstance(bodyxpath, list) else ""
results.append(
{
"title": _normalize(title),
"href": _normalize_url(href),
"body": _normalize(body),
}
)
if max_results and len(results) >= max_results:
return results
npx = tree.xpath('.//div[@class="nav-link"]')
if not npx or not max_results:
return results
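            # build the payload for the next page from the hidden inputs of the nav form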
next_page = npx[-1] if isinstance(npx, list) else None
if isinstance(next_page, _Element):
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
if isinstance(names, list) and isinstance(values, list):
payload = {str(n): str(v) for n, v in zip(names, values)}
return results
def _text_lite(
self,
keywords: str,
region: str = "wt-wt",
timelimit: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
assert keywords, "keywords is mandatory"
payload = {
"q": keywords,
"kl": region,
}
if timelimit:
payload["df"] = timelimit
cache = set()
results: list[dict[str, str]] = []
for _ in range(5):
resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload).content
if b"No more results." in resp_content:
return results
tree = document_fromstring(resp_content, self.parser)
elements = tree.xpath("//table[last()]//tr")
if not isinstance(elements, list):
return results
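            # the lite backend is parsed as blocks of 4 <tr> rows per result
            # (row 1: title/link, row 2: snippet), so pair rows with a repeating 1..4 counter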
data = zip(cycle(range(1, 5)), elements)
for i, e in data:
if isinstance(e, _Element):
if i == 1:
hrefxpath = e.xpath(".//a//@href")
href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, list) else None
if (
href is None
or href in cache
or href.startswith(
("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
)
):
                            # ad or duplicate result: skip the remaining rows of this block (i=2,3,4)
                            for _ in range(3):
                                next(data, None)
else:
cache.add(href)
titlexpath = e.xpath(".//a//text()")
title = str(titlexpath[0]) if titlexpath and isinstance(titlexpath, list) else ""
elif i == 2:
bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
body = (
"".join(str(x) for x in bodyxpath).strip()
if bodyxpath and isinstance(bodyxpath, list)
else ""
)
if href:
results.append(
{
"title": _normalize(title),
"href": _normalize_url(href),
"body": _normalize(body),
}
)
if max_results and len(results) >= max_results:
return results
npx = tree.xpath("//form[./input[contains(@value, 'ext')]]")
if not npx or not max_results:
return results
next_page = npx[-1] if isinstance(npx, list) else None
if isinstance(next_page, _Element):
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
if isinstance(names, list) and isinstance(values, list):
payload = {str(n): str(v) for n, v in zip(names, values)}
return results
def images(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: str | None = None,
size: str | None = None,
color: str | None = None,
type_image: str | None = None,
layout: str | None = None,
license_image: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
"""DuckDuckGo images search. Query params: https://duckduckgo.com/params.
Args:
keywords: keywords for query.
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
safesearch: on, moderate, off. Defaults to "moderate".
timelimit: Day, Week, Month, Year. Defaults to None.
size: Small, Medium, Large, Wallpaper. Defaults to None.
color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
type_image: photo, clipart, gif, transparent, line.
Defaults to None.
layout: Square, Tall, Wide. Defaults to None.
license_image: any (All Creative Commons), Public (PublicDomain),
Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
Use Commercially). Defaults to None.
max_results: max number of results. If None, returns results only from the first response. Defaults to None.
Returns:
List of dictionaries with images search results.
Raises:
DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
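        Example:
            A minimal usage sketch (needs network access; keys match the result dict assembled below):
            >>> with DDGS() as ddgs:
            ...     images = ddgs.images("butterfly", size="Large", type_image="photo", max_results=10)
            ...     thumbnails = [img["thumbnail"] for img in images]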
"""
assert keywords, "keywords is mandatory"
vqd = self._get_vqd(keywords)
safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
timelimit = f"time:{timelimit}" if timelimit else ""
size = f"size:{size}" if size else ""
color = f"color:{color}" if color else ""
type_image = f"type:{type_image}" if type_image else ""
layout = f"layout:{layout}" if layout else ""
license_image = f"license:{license_image}" if license_image else ""
payload = {
"l": region,
"o": "json",
"q": keywords,
"vqd": vqd,
"f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
"p": safesearch_base[safesearch.lower()],
}
cache = set()
results: list[dict[str, str]] = []
for _ in range(5):
resp_content = self._get_url(
"GET", "https://duckduckgo.com/i.js", params=payload, headers={"Referer": "https://duckduckgo.com/"}
).content
resp_json = json_loads(resp_content)
page_data = resp_json.get("results", [])
for row in page_data:
image_url = row.get("image")
if image_url and image_url not in cache:
cache.add(image_url)
result = {
"title": row["title"],
"image": _normalize_url(image_url),
"thumbnail": _normalize_url(row["thumbnail"]),
"url": _normalize_url(row["url"]),
"height": row["height"],
"width": row["width"],
"source": row["source"],
}
results.append(result)
if max_results and len(results) >= max_results:
return results
            next_url = resp_json.get("next")
            if next_url is None or not max_results:
                return results
            # carry over the pagination offset (the "s" query param) from the next-page url
            payload["s"] = next_url.split("s=")[-1].split("&")[0]
return results
def videos(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: str | None = None,
resolution: str | None = None,
duration: str | None = None,
license_videos: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
"""DuckDuckGo videos search. Query params: https://duckduckgo.com/params.
Args:
keywords: keywords for query.
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
safesearch: on, moderate, off. Defaults to "moderate".
timelimit: d, w, m. Defaults to None.
            resolution: high, standard. Defaults to None.
duration: short, medium, long. Defaults to None.
license_videos: creativeCommon, youtube. Defaults to None.
max_results: max number of results. If None, returns results only from the first response. Defaults to None.
Returns:
List of dictionaries with videos search results.
Raises:
DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
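        Example:
            A minimal usage sketch (needs network access; each dict is a raw item from the v.js response):
            >>> with DDGS() as ddgs:
            ...     videos = ddgs.videos("caterpillar", resolution="high", duration="short", max_results=10)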
"""
assert keywords, "keywords is mandatory"
vqd = self._get_vqd(keywords)
safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
resolution = f"videoDefinition:{resolution}" if resolution else ""
duration = f"videoDuration:{duration}" if duration else ""
license_videos = f"videoLicense:{license_videos}" if license_videos else ""
payload = {
"l": region,
"o": "json",
"q": keywords,
"vqd": vqd,
"f": f"{timelimit},{resolution},{duration},{license_videos}",
"p": safesearch_base[safesearch.lower()],
}
cache = set()
results: list[dict[str, str]] = []
for _ in range(8):
resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload).content
resp_json = json_loads(resp_content)
page_data = resp_json.get("results", [])
for row in page_data:
if row["content"] not in cache:
cache.add(row["content"])
results.append(row)
if max_results and len(results) >= max_results:
return results
            next_url = resp_json.get("next")
            if next_url is None or not max_results:
                return results
            payload["s"] = next_url.split("s=")[-1].split("&")[0]
return results
def news(
self,
keywords: str,
region: str = "wt-wt",
safesearch: str = "moderate",
timelimit: str | None = None,
max_results: int | None = None,
) -> list[dict[str, str]]:
"""DuckDuckGo news search. Query params: https://duckduckgo.com/params.
Args:
keywords: keywords for query.
region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
safesearch: on, moderate, off. Defaults to "moderate".
timelimit: d, w, m. Defaults to None.
max_results: max number of results. If None, returns results only from the first response. Defaults to None.
Returns:
List of dictionaries with news search results.
Raises:
DuckDuckGoSearchException: Base exception for duckduckgo_search errors.
RatelimitException: Inherits from DuckDuckGoSearchException, raised for exceeding API request rate limits.
TimeoutException: Inherits from DuckDuckGoSearchException, raised for API request timeouts.
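        Example:
            A minimal usage sketch (needs network access; "date" is an ISO 8601 UTC string, as built below):
            >>> with DDGS() as ddgs:
            ...     news = ddgs.news("climate", timelimit="w", max_results=10)
            ...     headlines = [(n["date"], n["title"]) for n in news]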
"""
assert keywords, "keywords is mandatory"
vqd = self._get_vqd(keywords)
safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
payload = {
"l": region,
"o": "json",
"noamp": "1",
"q": keywords,
"vqd": vqd,
"p": safesearch_base[safesearch.lower()],
}
if timelimit:
payload["df"] = timelimit
cache = set()
results: list[dict[str, str]] = []
for _ in range(5):
resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload).content
resp_json = json_loads(resp_content)
page_data = resp_json.get("results", [])
for row in page_data:
if row["url"] not in cache:
cache.add(row["url"])
image_url = row.get("image", None)
result = {
"date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
"title": row["title"],
"body": _normalize(row["excerpt"]),
"url": _normalize_url(row["url"]),
"image": _normalize_url(image_url),
"source": row["source"],
}
results.append(result)
if max_results and len(results) >= max_results:
return results
            next_url = resp_json.get("next")
            if next_url is None or not max_results:
                return results
            payload["s"] = next_url.split("s=")[-1].split("&")[0]
return results