import logging
import warnings
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
from decimal import Decimal
from functools import cached_property
from itertools import cycle, islice
from threading import Event
from types import TracebackType
from typing import Dict, List, Optional, Tuple, Type, Union, cast

import pyreqwest_impersonate as pri

try:
    from lxml.etree import _Element
    from lxml.html import HTMLParser as LHTMLParser
    from lxml.html import document_fromstring

    LXML_AVAILABLE = True
except ImportError:
    LXML_AVAILABLE = False

from .exceptions import RatelimitE, TimeoutE, WebscoutE
from .utils import (
    _calculate_distance,
    _extract_vqd,
    _normalize,
    _normalize_url,
    _text_extract_json,
    json_loads,
)

logger = logging.getLogger("webscout_search.WEBS")


class WEBS:
    """webscout_search class to get search results from duckduckgo.com."""

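    # A single class-level ThreadPoolExecutor is shared by all WEBS instances;
    # paginated page fetches for one query are fanned out across its workers.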
    _executor: ThreadPoolExecutor = ThreadPoolExecutor()

    def __init__(
        self,
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        proxies: Union[Dict[str, str], str, None] = None,
        timeout: Optional[int] = 10,
    ) -> None:
        """Initialize the WEBS object.

        Args:
            headers (dict, optional): Dictionary of headers for the HTTP client. Defaults to None.
            proxy (str, optional): Proxy for the HTTP client, supports http/https/socks5 protocols.
                Example: "http://user:[email protected]:3128". Defaults to None.
            proxies (dict or str, optional): Deprecated, use 'proxy' instead. Defaults to None.
            timeout (int, optional): Timeout value, in seconds, for the HTTP client. Defaults to 10.
        """
        self.proxy: Optional[str] = proxy
        assert self.proxy is None or isinstance(self.proxy, str), "proxy must be a str"
        if not proxy and proxies:
            warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
            self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
        self.headers = headers if headers else {}
        self.headers["Referer"] = "https://duckduckgo.com/"
        self.client = pri.Client(
            headers=self.headers,
            proxy=self.proxy,
            timeout=timeout,
            cookie_store=True,
            referer=True,
            impersonate="chrome_124",
            follow_redirects=False,
            verify=False,
        )
        self._exception_event = Event()
        self._chat_messages: List[Dict[str, str]] = []
        self._chat_vqd: str = ""

    def __enter__(self) -> "WEBS":
        return self

    def __exit__(
        self,
        exc_type: Optional[Type[BaseException]] = None,
        exc_val: Optional[BaseException] = None,
        exc_tb: Optional[TracebackType] = None,
    ) -> None:
        pass

    @cached_property
    def parser(self) -> "LHTMLParser":
        """Get HTML parser."""
        return LHTMLParser(remove_blank_text=True, remove_comments=True, remove_pis=True, collect_ids=False)

    def _get_url(
        self,
        method: str,
        url: str,
        params: Optional[Dict[str, str]] = None,
        content: Optional[bytes] = None,
        data: Optional[Union[Dict[str, str], bytes]] = None,
    ) -> bytes:
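        # Fail fast: once any request on this instance has errored, subsequent
        # calls raise immediately instead of retrying against the endpoint.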
        if self._exception_event.is_set():
            raise WebscoutE("Exception occurred in previous call.")
        try:
            resp = self.client.request(method, url, params=params, content=content, data=data)
        except Exception as ex:
            self._exception_event.set()
            if "time" in str(ex).lower():
                raise TimeoutE(f"{url} {type(ex).__name__}: {ex}") from ex
            raise WebscoutE(f"{url} {type(ex).__name__}: {ex}") from ex
        logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
        if resp.status_code == 200:
            return cast(bytes, resp.content)
        self._exception_event.set()
        if resp.status_code in (202, 301, 403):
            raise RatelimitE(f"{resp.url} {resp.status_code} Ratelimit")
        raise WebscoutE(f"{resp.url} returned an unexpected response. {params=} {content=} {data=}")

    def _get_vqd(self, keywords: str) -> str:
        """Get the vqd value for a search query."""
        resp_content = self._get_url("POST", "https://duckduckgo.com", data={"q": keywords})
        return _extract_vqd(resp_content, keywords)

    def chat(self, keywords: str, model: str = "gpt-3.5") -> str:
        """Initiates a chat session with Webscout AI.

        Args:
            keywords (str): The initial message or question to send to the AI.
            model (str): The model to use: "gpt-3.5", "claude-3-haiku". Defaults to "gpt-3.5".

        Returns:
            str: The response from the AI.
        """
        models = {"claude-3-haiku": "claude-3-haiku-20240307", "gpt-3.5": "gpt-3.5-turbo-0125"}

        if not self._chat_vqd:
            resp = self.client.get("https://duckduckgo.com/duckchat/v1/status", headers={"x-vqd-accept": "1"})
            self._chat_vqd = resp.headers.get("x-vqd-4", "")

        self._chat_messages.append({"role": "user", "content": keywords})

        json_data = {
            "model": models[model],
            "messages": self._chat_messages,
        }
        resp = self.client.post(
            "https://duckduckgo.com/duckchat/v1/chat", headers={"x-vqd-4": self._chat_vqd}, json=json_data
        )
        self._chat_vqd = resp.headers.get("x-vqd-4", "")
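        # The chat endpoint streams server-sent-event style lines ("data: {...}",
        # terminated by "[DONE]"); strip that framing and join the "message"
        # fragments into a single reply.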
        messages = []
        for line in resp.text.replace("data: ", "").replace("[DONE]", "").split("\n\n"):
            x = line.strip()
            if x:
                j = json_loads(x)
                message = j.get("message", "")
                messages.append(message)
        result = "".join(messages)
        self._chat_messages.append({"role": "assistant", "content": result})
        return result

    def text(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        backend: str = "api",
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m, y. Defaults to None.
            backend: api, html, lite. Defaults to "api".
                api - collect data from https://duckduckgo.com,
                html - collect data from https://html.duckduckgo.com,
                lite - collect data from https://lite.duckduckgo.com.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        if not LXML_AVAILABLE and backend != "api":
            backend = "api"
            warnings.warn("lxml is not installed. Using backend='api'.", stacklevel=2)

        if backend == "api":
            results = self._text_api(keywords, region, safesearch, timelimit, max_results)
        elif backend == "html":
            results = self._text_html(keywords, region, safesearch, timelimit, max_results)
        elif backend == "lite":
            results = self._text_lite(keywords, region, timelimit, max_results)
        else:
            raise WebscoutE(f"Invalid backend: {backend}. Use 'api', 'html' or 'lite'.")
        return results

    def _text_api(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        payload = {
            "q": keywords,
            "kl": region,
            "l": region,
            "p": "",
            "s": "0",
            "df": "",
            "vqd": vqd,
            "ex": "",
        }
        safesearch = safesearch.lower()
        if safesearch == "moderate":
            payload["ex"] = "-1"
        elif safesearch == "off":
            payload["ex"] = "-2"
        elif safesearch == "on":
            payload["p"] = "1"
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_api_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://links.duckduckgo.com/d.js", params=payload)
            page_data = _text_extract_json(resp_content, keywords)
            page_results = []
            for row in page_data:
                href = row.get("u", None)
                if href and href not in cache and href != f"http://www.google.com/search?q={keywords}":
                    cache.add(href)
                    body = _normalize(row["a"])
                    if body:
                        result = {
                            "title": _normalize(row["t"]),
                            "href": _normalize_url(href),
                            "body": body,
                        }
                        page_results.append(result)
            return page_results
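        # Pages are fetched concurrently on the shared executor. The first page
        # starts at offset 0; deeper pages start at 23 and step by 50, with the
        # total capped at 500 results.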
        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(23, max_results, 50))
        for r in self._executor.map(_text_api_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def _text_html(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        payload = {
            "q": keywords,
            "kl": region,
            "p": safesearch_base[safesearch.lower()],
            "o": "json",
            "api": "d.js",
        }
        if timelimit:
            payload["df"] = timelimit
        if max_results and max_results > 20:
            vqd = self._get_vqd(keywords)
            payload["vqd"] = vqd

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_html_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("POST", "https://html.duckduckgo.com/html", data=payload)
            if b"No results." in resp_content:
                return []

            page_results = []
            tree = document_fromstring(resp_content, self.parser)
            elements = tree.xpath("//div[h2]")
            if not isinstance(elements, List):
                return []
            for e in elements:
                if isinstance(e, _Element):
                    hrefxpath = e.xpath("./a/@href")
                    href = str(hrefxpath[0]) if isinstance(hrefxpath, List) else None
                    if (
                        href
                        and href not in cache
                        and not href.startswith(
                            ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
                        )
                    ):
                        cache.add(href)
                        titlexpath = e.xpath("./h2/a/text()")
                        title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
                        bodyxpath = e.xpath("./a//text()")
                        body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
                        result = {
                            "title": _normalize(title),
                            "href": _normalize_url(href),
                            "body": _normalize(body),
                        }
                        page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(23, max_results, 50))
        for r in self._executor.map(_text_html_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def _text_lite(
        self,
        keywords: str,
        region: str = "wt-wt",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout text search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            timelimit: d, w, m, y. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "o": "json",
            "api": "d.js",
            "kl": region,
        }
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _text_lite_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("POST", "https://lite.duckduckgo.com/lite/", data=payload)
            if b"No more results." in resp_content:
                return []

            page_results = []
            tree = document_fromstring(resp_content, self.parser)
            elements = tree.xpath("//table[last()]//tr")
            if not isinstance(elements, List):
                return []
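            # The lite frontend renders each hit as a group of four <tr> rows
            # (roughly: title/link, snippet, visible url, spacer), so every row
            # is tagged with its position inside the group via cycle(1..4).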
            data = zip(cycle(range(1, 5)), elements)
            for i, e in data:
                if isinstance(e, _Element):
                    if i == 1:
                        hrefxpath = e.xpath(".//a//@href")
                        href = str(hrefxpath[0]) if hrefxpath and isinstance(hrefxpath, List) else None
                        if (
                            href is None
                            or href in cache
                            or href.startswith(
                                ("http://www.google.com/search?q=", "https://duckduckgo.com/y.js?ad_domain")
                            )
                        ):
                            # Skip the remaining rows of this result's group.
                            for _ in range(3):
                                next(data, None)
                        else:
                            cache.add(href)
                            titlexpath = e.xpath(".//a//text()")
                            title = str(titlexpath[0]) if isinstance(titlexpath, List) else ""
                    elif i == 2:
                        bodyxpath = e.xpath(".//td[@class='result-snippet']//text()")
                        body = "".join(str(x) for x in bodyxpath) if isinstance(bodyxpath, List) else ""
                        if href:
                            result = {
                                "title": _normalize(title),
                                "href": _normalize_url(href),
                                "body": _normalize(body),
                            }
                            page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(23, max_results, 50))
        for r in self._executor.map(_text_lite_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def images(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        size: Optional[str] = None,
        color: Optional[str] = None,
        type_image: Optional[str] = None,
        layout: Optional[str] = None,
        license_image: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout images search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: Day, Week, Month, Year. Defaults to None.
            size: Small, Medium, Large, Wallpaper. Defaults to None.
            color: color, Monochrome, Red, Orange, Yellow, Green, Blue,
                Purple, Pink, Brown, Black, Gray, Teal, White. Defaults to None.
            type_image: photo, clipart, gif, transparent, line. Defaults to None.
            layout: Square, Tall, Wide. Defaults to None.
            license_image: any (All Creative Commons), Public (Public Domain),
                Share (Free to Share and Use), ShareCommercially (Free to Share and Use Commercially),
                Modify (Free to Modify, Share, and Use), ModifyCommercially (Free to Modify, Share, and
                Use Commercially). Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with images search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "1", "off": "-1"}
        timelimit = f"time:{timelimit}" if timelimit else ""
        size = f"size:{size}" if size else ""
        color = f"color:{color}" if color else ""
        type_image = f"type:{type_image}" if type_image else ""
        layout = f"layout:{layout}" if layout else ""
        license_image = f"license:{license_image}" if license_image else ""
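        # All image filters are packed into the single "f" query parameter as a
        # comma-separated list; unused filters stay as empty strings.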
        payload = {
            "l": region,
            "o": "json",
            "q": keywords,
            "vqd": vqd,
            "f": f"{timelimit},{size},{color},{type_image},{layout},{license_image}",
            "p": safesearch_base[safesearch.lower()],
        }

        cache = set()
        results: List[Dict[str, str]] = []

        def _images_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/i.js", params=payload)
            resp_json = json_loads(resp_content)

            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                image_url = row.get("image")
                if image_url and image_url not in cache:
                    cache.add(image_url)
                    result = {
                        "title": row["title"],
                        "image": _normalize_url(image_url),
                        "thumbnail": _normalize_url(row["thumbnail"]),
                        "url": _normalize_url(row["url"]),
                        "height": row["height"],
                        "width": row["width"],
                        "source": row["source"],
                    }
                    page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 500)
            slist.extend(range(100, max_results, 100))
        for r in self._executor.map(_images_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def videos(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        resolution: Optional[str] = None,
        duration: Optional[str] = None,
        license_videos: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout videos search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m. Defaults to None.
            resolution: high, standard. Defaults to None.
            duration: short, medium, long. Defaults to None.
            license_videos: creativeCommon, youtube. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with videos search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        timelimit = f"publishedAfter:{timelimit}" if timelimit else ""
        resolution = f"videoDefinition:{resolution}" if resolution else ""
        duration = f"videoDuration:{duration}" if duration else ""
        license_videos = f"videoLicense:{license_videos}" if license_videos else ""
        payload = {
            "l": region,
            "o": "json",
            "q": keywords,
            "vqd": vqd,
            "f": f"{timelimit},{resolution},{duration},{license_videos}",
            "p": safesearch_base[safesearch.lower()],
        }

        cache = set()
        results: List[Dict[str, str]] = []

        def _videos_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/v.js", params=payload)
            resp_json = json_loads(resp_content)

            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                if row["content"] not in cache:
                    cache.add(row["content"])
                    page_results.append(row)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 400)
            slist.extend(range(59, max_results, 59))
        for r in self._executor.map(_videos_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def news(
        self,
        keywords: str,
        region: str = "wt-wt",
        safesearch: str = "moderate",
        timelimit: Optional[str] = None,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout news search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".
            safesearch: on, moderate, off. Defaults to "moderate".
            timelimit: d, w, m. Defaults to None.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with news search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        safesearch_base = {"on": "1", "moderate": "-1", "off": "-2"}
        payload = {
            "l": region,
            "o": "json",
            "noamp": "1",
            "q": keywords,
            "vqd": vqd,
            "p": safesearch_base[safesearch.lower()],
        }
        if timelimit:
            payload["df"] = timelimit

        cache = set()
        results: List[Dict[str, str]] = []

        def _news_page(s: int) -> List[Dict[str, str]]:
            payload["s"] = f"{s}"
            resp_content = self._get_url("GET", "https://duckduckgo.com/news.js", params=payload)
            resp_json = json_loads(resp_content)
            page_data = resp_json.get("results", [])
            page_results = []
            for row in page_data:
                if row["url"] not in cache:
                    cache.add(row["url"])
                    image_url = row.get("image", None)
                    result = {
                        "date": datetime.fromtimestamp(row["date"], timezone.utc).isoformat(),
                        "title": row["title"],
                        "body": _normalize(row["excerpt"]),
                        "url": _normalize_url(row["url"]),
                        "image": _normalize_url(image_url),
                        "source": row["source"],
                    }
                    page_results.append(result)
            return page_results

        slist = [0]
        if max_results:
            max_results = min(max_results, 200)
            slist.extend(range(29, max_results, 29))
        for r in self._executor.map(_news_page, slist):
            results.extend(r)

        return list(islice(results, max_results))

    def answers(self, keywords: str) -> List[Dict[str, str]]:
        """Webscout instant answers. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.

        Returns:
            List of dictionaries with instant answers results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": f"what is {keywords}",
            "format": "json",
        }
        resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
        page_data = json_loads(resp_content)

        results = []
        answer = page_data.get("AbstractText")
        url = page_data.get("AbstractURL")
        if answer:
            results.append(
                {
                    "icon": None,
                    "text": answer,
                    "topic": None,
                    "url": url,
                }
            )

        payload = {
            "q": f"{keywords}",
            "format": "json",
        }
        resp_content = self._get_url("GET", "https://api.duckduckgo.com/", params=payload)
        resp_json = json_loads(resp_content)
        page_data = resp_json.get("RelatedTopics", [])

        for row in page_data:
            topic = row.get("Name")
            if not topic:
                icon = row["Icon"].get("URL")
                results.append(
                    {
                        "icon": f"https://duckduckgo.com{icon}" if icon else "",
                        "text": row["Text"],
                        "topic": None,
                        "url": row["FirstURL"],
                    }
                )
            else:
                for subrow in row["Topics"]:
                    icon = subrow["Icon"].get("URL")
                    results.append(
                        {
                            "icon": f"https://duckduckgo.com{icon}" if icon else "",
                            "text": subrow["Text"],
                            "topic": topic,
                            "url": subrow["FirstURL"],
                        }
                    )

        return results

    def suggestions(self, keywords: str, region: str = "wt-wt") -> List[Dict[str, str]]:
        """Webscout suggestions. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            region: wt-wt, us-en, uk-en, ru-ru, etc. Defaults to "wt-wt".

        Returns:
            List of dictionaries with suggestions results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        payload = {
            "q": keywords,
            "kl": region,
        }
        resp_content = self._get_url("GET", "https://duckduckgo.com/ac/", params=payload)
        page_data = json_loads(resp_content)
        return list(page_data)

    def maps(
        self,
        keywords: str,
        place: Optional[str] = None,
        street: Optional[str] = None,
        city: Optional[str] = None,
        county: Optional[str] = None,
        state: Optional[str] = None,
        country: Optional[str] = None,
        postalcode: Optional[str] = None,
        latitude: Optional[str] = None,
        longitude: Optional[str] = None,
        radius: int = 0,
        max_results: Optional[int] = None,
    ) -> List[Dict[str, str]]:
        """Webscout maps search. Query params: https://duckduckgo.com/params.

        Args:
            keywords: keywords for query.
            place: if set, the other location parameters are not used. Defaults to None.
            street: house number/street. Defaults to None.
            city: city of search. Defaults to None.
            county: county of search. Defaults to None.
            state: state of search. Defaults to None.
            country: country of search. Defaults to None.
            postalcode: postalcode of search. Defaults to None.
            latitude: geographic coordinate (north-south position). Defaults to None.
            longitude: geographic coordinate (east-west position); if latitude and
                longitude are set, the other location parameters are not used. Defaults to None.
            radius: expand the search square by the distance in kilometers. Defaults to 0.
            max_results: max number of results. If None, returns results only from the first response. Defaults to None.

        Returns:
            List of dictionaries with maps search results.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd(keywords)

        if latitude and longitude:
            lat_t = Decimal(latitude.replace(",", "."))
            lat_b = Decimal(latitude.replace(",", "."))
            lon_l = Decimal(longitude.replace(",", "."))
            lon_r = Decimal(longitude.replace(",", "."))
            if radius == 0:
                radius = 1
        else:
            if place:
                params = {
                    "q": place,
                    "polygon_geojson": "0",
                    "format": "jsonv2",
                }
            else:
                params = {
                    "polygon_geojson": "0",
                    "format": "jsonv2",
                }
                if street:
                    params["street"] = street
                if city:
                    params["city"] = city
                if county:
                    params["county"] = county
                if state:
                    params["state"] = state
                if country:
                    params["country"] = country
                if postalcode:
                    params["postalcode"] = postalcode

            resp_content = self._get_url(
                "GET",
                "https://nominatim.openstreetmap.org/search.php",
                params=params,
            )
            if resp_content == b"[]":
                raise WebscoutE("maps() Coordinates are not found, check function parameters.")
            resp_json = json_loads(resp_content)
            coordinates = resp_json[0]["boundingbox"]
            lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
            lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
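        # Widen the bounding box by `radius` km on every side: one degree of
        # latitude is ~111.32 km, so 1 km corresponds to roughly 0.008983 degrees.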
        lat_t += Decimal(radius) * Decimal(0.008983)
        lat_b -= Decimal(radius) * Decimal(0.008983)
        lon_l -= Decimal(radius) * Decimal(0.008983)
        lon_r += Decimal(radius) * Decimal(0.008983)
        logger.debug(f"bbox coordinates\n{lat_t} {lon_l}\n{lat_b} {lon_r}")

        cache = set()
        results: List[Dict[str, str]] = []

        def _maps_page(
            bbox: Tuple[Decimal, Decimal, Decimal, Decimal],
        ) -> Optional[List[Dict[str, str]]]:
            if max_results and len(results) >= max_results:
                return None
            lat_t, lon_l, lat_b, lon_r = bbox
            params = {
                "q": keywords,
                "vqd": vqd,
                "tg": "maps_places",
                "rt": "D",
                "mkexp": "b",
                "wiki_info": "1",
                "is_requery": "1",
                "bbox_tl": f"{lat_t},{lon_l}",
                "bbox_br": f"{lat_b},{lon_r}",
                "strict_bbox": "1",
            }
            resp_content = self._get_url("GET", "https://duckduckgo.com/local.js", params=params)
            resp_json = json_loads(resp_content)
            page_data = resp_json.get("results", [])

            page_results = []
            for res in page_data:
                r_name = f'{res["name"]} {res["address"]}'
                if r_name in cache:
                    continue
                cache.add(r_name)
                result = {
                    "title": res["name"],
                    "address": res["address"],
                    "country_code": res["country_code"],
                    "url": _normalize_url(res["website"]),
                    "phone": res["phone"] or "",
                    "latitude": res["coordinates"]["latitude"],
                    "longitude": res["coordinates"]["longitude"],
                    "source": _normalize_url(res["url"]),
                    "image": x.get("image", "") if (x := res["embed"]) else "",
                    "desc": x.get("description", "") if (x := res["embed"]) else "",
                    "hours": res["hours"] or "",
                    "category": res["ddg_category"] or "",
                    "facebook": f"www.facebook.com/profile.php?id={x}" if (x := res["facebook_id"]) else "",
                    "instagram": f"https://www.instagram.com/{x}" if (x := res["instagram_id"]) else "",
                    "twitter": f"https://twitter.com/{x}" if (x := res["twitter_id"]) else "",
                }
                page_results.append(result)
            return page_results
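        # Breadth-first sweep over map squares: every bbox that is still large
        # (per _calculate_distance between its corners) is split into four
        # quadrants queued for the next round, until max_results is collected
        # or a round comes back empty.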
        start_bbox = (lat_t, lon_l, lat_b, lon_r)
        work_bboxes = [start_bbox]
        while work_bboxes:
            queue_bboxes = []  # bboxes for the next iteration
            tasks = []
            for bbox in work_bboxes:
                tasks.append(bbox)
                # if the square is still too large, divide it into four parts and queue them
                if _calculate_distance(lat_t, lon_l, lat_b, lon_r) > 1:
                    lat_t, lon_l, lat_b, lon_r = bbox
                    lat_middle = (lat_t + lat_b) / 2
                    lon_middle = (lon_l + lon_r) / 2
                    bbox1 = (lat_t, lon_l, lat_middle, lon_middle)
                    bbox2 = (lat_t, lon_middle, lat_middle, lon_r)
                    bbox3 = (lat_middle, lon_l, lat_b, lon_middle)
                    bbox4 = (lat_middle, lon_middle, lat_b, lon_r)
                    queue_bboxes.extend([bbox1, bbox2, bbox3, bbox4])

            work_bboxes_results = []
            for r in self._executor.map(_maps_page, tasks):
                if r:
                    work_bboxes_results.extend(r)

            for x in work_bboxes_results:
                if isinstance(x, list):
                    results.extend(x)
                elif isinstance(x, dict):
                    results.append(x)

            work_bboxes = queue_bboxes
            if not max_results or len(results) >= max_results or len(work_bboxes_results) == 0:
                break

        return list(islice(results, max_results))

    def translate(
        self, keywords: Union[List[str], str], from_: Optional[str] = None, to: str = "en"
    ) -> List[Dict[str, str]]:
        """Webscout translate.

        Args:
            keywords: string or list of strings to translate.
            from_: language to translate from (detected automatically if None). Defaults to None.
            to: language to translate to. Defaults to "en".

        Returns:
            List of dictionaries with translated keywords.

        Raises:
            WebscoutE: Base exception for webscout_search errors.
            RatelimitE: Inherits from WebscoutE, raised for exceeding API request rate limits.
            TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
        """
        assert keywords, "keywords is mandatory"

        vqd = self._get_vqd("translate")

        payload = {
            "vqd": vqd,
            "query": "translate",
            "to": to,
        }
        if from_:
            payload["from"] = from_
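        # Each keyword is sent as the raw POST body (not a form field); the vqd
        # token and the language pair travel in the query string.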
        def _translate_keyword(keyword: str) -> Dict[str, str]:
            resp_content = self._get_url(
                "POST",
                "https://duckduckgo.com/translation.js",
                params=payload,
                content=keyword.encode(),
            )
            page_data: Dict[str, str] = json_loads(resp_content)
            page_data["original"] = keyword
            return page_data

        if isinstance(keywords, str):
            keywords = [keywords]

        results = list(self._executor.map(_translate_keyword, keywords))
        return results
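

# Minimal usage sketch. The module uses relative imports, so import it from the
# package context; `from webscout import WEBS` below is an assumed entry point:
#
#     from webscout import WEBS
#
#     with WEBS() as webs:
#         for r in webs.text("python", max_results=5):
#             print(r["title"], r["href"])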