# Patent PDF downloader — scrapes Google Patents pages for direct PDF links.
import os
import tempfile
from typing import List, Optional, Union

import requests
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    Automate downloading patent PDFs from Google Patents.

    Builds the patent page URL from a patent number, scrapes the page for
    the direct PDF link (preferring the ``citation_pdf_url`` meta tag,
    with an anchor-tag fallback), and saves the PDF to disk.
    """

    # All patent pages live under this path; "/<number>/en" is appended.
    base_url = "https://patents.google.com/patent"

    # Network timeout in seconds for both the HTML page and the PDF fetch,
    # so a stalled connection cannot hang the caller forever.
    _TIMEOUT = 30

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to the system temporary
            directory; the directory is created if it does not exist.

        Returns
        -------
        List[str]
            Paths of the PDFs that downloaded successfully. Failures are
            reported on stdout and skipped, so the result may be shorter
            than the input list.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use a temporary directory if no output path is provided.
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files: List[str] = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch semantics: report the failure and keep
                # going with the remaining patents instead of aborting.
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or PDF request fails (non-200 status) or
            no PDF link can be found on the page.
        """
        # Construct the Google Patents URL.
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page.
        response = requests.get(patent_url, timeout=self._TIMEOUT)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")

        # Parse the HTML content and extract the PDF link.
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file.
        pdf_response = requests.get(pdf_url, timeout=self._TIMEOUT)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")

        # Save the PDF to the specified output directory.
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from a parsed patent page.

        Declared as a @staticmethod: it was previously a plain function
        missing ``self``, so the instance call ``self._extract_pdf_link(soup)``
        raised ``TypeError`` (two arguments passed to a one-argument function).

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, otherwise None.
            NOTE(review): the anchor fallback may yield a relative URL —
            confirm callers can fetch one before relying on it.
        """
        # Preferred source: the 'citation_pdf_url' meta tag.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]
        # Fallback: first <a> tag whose href ends with '.pdf'.
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]
        return None