Spaces:

disLodge
/

Call_model

Sleeping

App Files Files Community

disLodge committed on May 6

Commit

4bd3448

verified ·

1 Parent(s): c1a54d8

PDF issue fixes

Browse files

Files changed (1) hide show

app.py +53 -7

app.py CHANGED Viewed

@@ -10,17 +10,63 @@ from langchain_core.documents import Document
 from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import CharacterTextSplitter
 from huggingface_hub import InferenceClient
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def extract_pdf_text(url: str) -> str:
-    response = requests.get(url)
-    pdf_file = BytesIO(response.content)
-    text = extract_text(pdf_file)
-    return text
 pdf_url = "https://huggingface.co/spaces/disLodge/Call_model/raw/main/temp.pdf"
-text = extract_pdf_text(pdf_url)
 docs_list = [Document(page_content=text)]
 text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)

 from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import CharacterTextSplitter
 from huggingface_hub import InferenceClient
+import logging
+# Configure the root logger so INFO-level (and higher) records are emitted.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger used by the PDF download/extraction code below.
+logger = logging.getLogger(__name__)
+# Inference client constructed for the "HuggingFaceH4/zephyr-7b-beta" model id.
+client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
+def extract_pdf_text(url: str, fallback_url: str = None) -> str:
+    try:
+        logger.info(f"Attempting to download PDF from {url}")
+        response = requests.get(url, timeout=10)
+        response.raise_for_status()
+        content_type = response.headers.get("content-type", "")
+        if "application/pdf" not in content_type.lower():
+                logger.warning(f"URL {url} does not point to a PDF. Content-Type: {content_type}")
+                if fallback_url:
+                    logger.info(f"Falling back to {fallback_url}")
+                    return extract_pdf_text(fallback_url)
+                raise ValueError("Downloaded file is not a PDF")
+        pdf_file = BytesIO(response.content)
+        logger.info(f"Extracting text from PDF (size: {len(response.content)} bytes)")
+        text = extract_text(pdf_file)
+        if not text.strip():
+            logger.warning("Extracted text is empty")
+            if fallback_url:
+                logger.info(f"Falling back to {fallback_url}")
+                return extract_pdf_text(fallback_url)
+            raise ValueError("No text could be extracted from the PDF")
+        logger.info("PDF text extracted successfully")
+        return text
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download PDF from {url}: {e}")
+        if fallback_url:
+            logger.info(f"Falling back to {fallback_url}")
+            return extract_pdf_text(fallback_url)
+        raise
+    except Exception as e:
+        logger.error(f"Error processing PDF from {url}: {e}")
+        if fallback_url:
+            logger.info(f"Falling back to {fallback_url}")
+            return extract_pdf_text(fallback_url)
+        raise
+# Primary PDF: temp.pdf served from the disLodge/Call_model Space repository.
 pdf_url = "https://huggingface.co/spaces/disLodge/Call_model/raw/main/temp.pdf"
+# arXiv PDF passed as the fallback when the primary URL cannot be used.
+fallback_pdf_url = "https://arxiv.org/pdf/2408.09869"
+# Load the document text up front; a failure is logged here and then re-raised
+# unchanged, so the lines below never run without `text`.
+try:
+    text = extract_pdf_text(pdf_url, fallback_url=fallback_pdf_url)
+except Exception as e:
+    logger.error(f"Failed to process PDF: {e}")
+    raise
+# Wrap the whole extracted text in a single Document.
 docs_list = [Document(page_content=text)]
+# Splitter built via the tiktoken encoder; chunk_size/chunk_overlap are
+# presumably counted in tokens rather than characters — confirm against LangChain docs.
 text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)