# Patent PDF downloader — scrapes Google Patents pages for direct PDF links.
import os
import tempfile
from typing import List, Optional, Union

import requests
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    Automate downloading patent PDFs from Google Patents.

    Builds the patent page URL from a patent number, scrapes the page for
    the direct PDF link (preferring the ``citation_pdf_url`` meta tag,
    with an anchor-tag fallback), and saves the PDF to disk.
    """

    # All patent pages live under this path; "/<number>/en" is appended.
    base_url = "https://patents.google.com/patent"

    # Network timeout in seconds for both the HTML page and the PDF fetch,
    # so a stalled connection cannot hang the caller forever.
    _TIMEOUT = 30

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to the system temporary
            directory; the directory is created if it does not exist.

        Returns
        -------
        List[str]
            Paths of the PDFs that downloaded successfully. Failures are
            reported on stdout and skipped, so the result may be shorter
            than the input list.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use a temporary directory if no output path is provided.
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files: List[str] = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch semantics: report the failure and keep
                # going with the remaining patents instead of aborting.
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or PDF request fails (non-200 status) or
            no PDF link can be found on the page.
        """
        # Construct the Google Patents URL.
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page.
        response = requests.get(patent_url, timeout=self._TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")

        # Parse the HTML content and extract the PDF link.
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file.
        pdf_response = requests.get(pdf_url, timeout=self._TIMEOUT)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")

        # Save the PDF to the specified output directory.
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from a parsed patent page.

        Declared as a @staticmethod: it was previously a plain function
        missing ``self``, so the instance call ``self._extract_pdf_link(soup)``
        raised ``TypeError`` (two arguments passed to a one-argument function).

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, otherwise None.
            NOTE(review): the anchor fallback may yield a relative URL —
            confirm callers can fetch one before relying on it.
        """
        # Preferred source: the 'citation_pdf_url' meta tag.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]
        # Fallback: first <a> tag whose href ends with '.pdf'.
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]
        return None