Spaces:

VelaTest
/

PDFExtractor

Sleeping

File size: 2,369 Bytes

172e21d

import os
import requests
from application.utils.logger import get_logger

from langchain_core.tools import tool


logger = get_logger()

def _stream_to_file(url: str, destination: str, timeout: int = 20, chunk_size: int = 8192) -> None:
    """Stream the response body of ``url`` into ``destination``.

    The body is first written to a ``.part`` sibling file and only moved over
    ``destination`` once the download has completed, so an interrupted
    transfer never leaves a truncated file at the final path.

    Raises:
        requests.exceptions.RequestException: On connection, timeout or HTTP
            status errors.
        OSError: If the file cannot be written or moved into place.
    """
    partial_path = f"{destination}.part"
    try:
        # Using the response as a context manager releases the streamed
        # connection even when an error interrupts the transfer.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        file.write(chunk)
        os.replace(partial_path, destination)
    except BaseException:
        # Best-effort cleanup of the incomplete download; the original error
        # is what matters to the caller, so a failed removal is ignored.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        raise


@tool
def download_pdf(filename: str, url: str, save_path: str = "reports", overwrite: bool = False):
    """
    Downloads a PDF file from a given URL ('url') and saves it locally
    with the specified 'filename'. Returns the local path if successful, otherwise None.

    Use this tool AFTER get_sustainability_report_pdf has returned a valid PDF link or if user provides the PDF link.

    Args:
        filename (str): The name to save the PDF as (should end with .pdf).
            Any directory components are stripped so the file always lands
            directly inside 'save_path'.
        url (str): The direct URL to the PDF file.
        save_path (str): The directory to save the PDF into (default: "reports").
        overwrite (bool): Whether to overwrite the file if it already exists.

    Returns:
        str | None: The path to the saved file if successful, otherwise None.
    """
    try:
        if not filename.lower().endswith(".pdf"):
            logger.warning(f"Filename does not end with .pdf: {filename}")
            return None

        # The filename is supplied by the model/user; keep only its final
        # component so values like "../../x.pdf" cannot escape save_path.
        safe_name = os.path.basename(filename.replace("\\", "/"))
        if safe_name != filename:
            logger.warning(
                f"Directory components removed from filename: {filename} -> {safe_name}"
            )

        os.makedirs(save_path, exist_ok=True)
        full_path = os.path.join(save_path, safe_name)

        if os.path.exists(full_path) and not overwrite:
            logger.info(f"File already exists, skipping download: {full_path}")
            return full_path

        logger.info(f"Starting download from {url}")
        _stream_to_file(url, full_path)
        logger.info(f"Successfully downloaded to: {full_path}")
        return full_path

    # Specific request errors come first; they are subclasses of
    # RequestException and would otherwise be shadowed by it.
    except requests.exceptions.Timeout:
        logger.error(f"Timeout while downloading {url}")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP error while downloading {url}: {http_err}")
    except requests.exceptions.RequestException as req_err:
        logger.error(f"Request error while downloading {url}: {req_err}")
    except Exception as e:
        # Tool boundary: report the failure as None rather than raising.
        logger.error(f"Unexpected error: {e}")

    return None