disLodge committed on
Commit
4bd3448
·
verified ·
1 Parent(s): c1a54d8

PDF issue fixes

Browse files
Files changed (1) hide show
  1. app.py +53 -7
app.py CHANGED
@@ -10,17 +10,63 @@ from langchain_core.documents import Document
10
  from langchain_core.prompts import ChatPromptTemplate
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from huggingface_hub import InferenceClient
 
13
 
14
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
15
 
16
- def extract_pdf_text(url: str) -> str:
17
- response = requests.get(url)
18
- pdf_file = BytesIO(response.content)
19
- text = extract_text(pdf_file)
20
- return text
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  pdf_url = "https://huggingface.co/spaces/disLodge/Call_model/raw/main/temp.pdf"
23
- text = extract_pdf_text(pdf_url)
 
 
 
 
 
 
24
  docs_list = [Document(page_content=text)]
25
 
26
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
 
10
  from langchain_core.prompts import ChatPromptTemplate
11
  from langchain.text_splitter import CharacterTextSplitter
12
  from huggingface_hub import InferenceClient
13
+ import logging
14
 
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
 
18
+ client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
 
19
 
20
def _extract_pdf_text_once(url: str) -> str:
    """Download *url* and return the text extracted from it (single attempt).

    Raises:
        requests.exceptions.RequestException: the download failed, timed out,
            or returned an HTTP error status.
        ValueError: the response is not served as ``application/pdf``, or the
            extracted text is empty/whitespace-only.
        Exception: whatever ``extract_text`` raises while parsing.
    """
    logger.info("Attempting to download PDF from %s", url)
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    content_type = response.headers.get("content-type", "")
    if "application/pdf" not in content_type.lower():
        logger.warning(
            "URL %s does not point to a PDF. Content-Type: %s", url, content_type
        )
        raise ValueError("Downloaded file is not a PDF")

    logger.info(
        "Extracting text from PDF (size: %d bytes)", len(response.content)
    )
    text = extract_text(BytesIO(response.content))

    if not text.strip():
        logger.warning("Extracted text is empty")
        raise ValueError("No text could be extracted from the PDF")

    logger.info("PDF text extracted successfully")
    return text


def extract_pdf_text(url: str, fallback_url: str = None) -> str:
    """Return the text of the PDF at *url*, optionally retrying a fallback URL.

    The primary URL is tried first. If anything goes wrong (download error,
    non-PDF content type, parse failure, or empty text) and *fallback_url* is
    given, the fallback is tried exactly once, with no further fallback.

    Args:
        url: Location of the PDF to download.
        fallback_url: Optional alternative location used when *url* fails.

    Returns:
        The non-empty extracted text.

    Raises:
        requests.exceptions.RequestException: the (last attempted) download
            failed.
        ValueError: the (last attempted) document was not a PDF or yielded no
            text.
        Exception: any parsing error from the (last attempted) document.
    """
    try:
        return _extract_pdf_text_once(url)
    except requests.exceptions.RequestException as e:
        logger.error("Failed to download PDF from %s: %s", url, e)
        if not fallback_url:
            raise
    except Exception as e:
        logger.error("Error processing PDF from %s: %s", url, e)
        if not fallback_url:
            raise

    # The fallback runs OUTSIDE the try block on purpose: if it fails, its
    # exception must propagate to the caller instead of being caught by the
    # handlers above, which would blame the primary URL and download the
    # fallback a second time.
    logger.info("Falling back to %s", fallback_url)
    return extract_pdf_text(fallback_url)
60
+
61
+
62
  pdf_url = "https://huggingface.co/spaces/disLodge/Call_model/raw/main/temp.pdf"
63
+ fallback_pdf_url = "https://arxiv.org/pdf/2408.09869"
64
+ try:
65
+ text = extract_pdf_text(pdf_url, fallback_url=fallback_pdf_url)
66
+ except Exception as e:
67
+ logger.error(f"Failed to process PDF: {e}")
68
+ raise
69
+
70
  docs_list = [Document(page_content=text)]
71
 
72
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)