disLodge committed on
Commit
baf459f
·
verified ·
1 Parent(s): c012b3d

new changes

Browse files
Files changed (1) hide show
  1. app.py +8 -49
app.py CHANGED
@@ -17,56 +17,15 @@ logger = logging.getLogger(__name__)
17
 
18
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
19
 
20
- def extract_pdf_text(url: str, fallback_url: str = None) -> str:
21
- try:
22
- logger.info(f"Attempting to download PDF from {url}")
23
- response = requests.get(url, timeout=10)
24
- response.raise_for_status()
25
-
26
- content_type = response.headers.get("content-type", "")
27
- if "application/pdf" not in content_type.lower():
28
- logger.warning(f"URL {url} does not point to a PDF. Content-Type: {content_type}")
29
- if fallback_url:
30
- logger.info(f"Falling back to {fallback_url}")
31
- return extract_pdf_text(fallback_url)
32
- raise ValueError("Downloaded file is not a PDF")
33
-
34
- pdf_file = BytesIO(response.content)
35
- logger.info(f"Extracting text from PDF (size: {len(response.content)} bytes)")
36
- text = extract_text(pdf_file)
37
-
38
- if not text.strip():
39
- logger.warning("Extracted text is empty")
40
- if fallback_url:
41
- logger.info(f"Falling back to {fallback_url}")
42
- return extract_pdf_text(fallback_url)
43
- raise ValueError("No text could be extracted from the PDF")
44
-
45
- logger.info("PDF text extracted successfully")
46
- return text
47
-
48
- except requests.exceptions.RequestException as e:
49
- logger.error(f"Failed to download PDF from {url}: {e}")
50
- if fallback_url:
51
- logger.info(f"Falling back to {fallback_url}")
52
- return extract_pdf_text(fallback_url)
53
- raise
54
- except Exception as e:
55
- logger.error(f"Error processing PDF from {url}: {e}")
56
- if fallback_url:
57
- logger.info(f"Falling back to {fallback_url}")
58
- return extract_pdf_text(fallback_url)
59
- raise
60
-
61
-
62
- pdf_url = "https://huggingface.co/spaces/disLodge/Call_model/raw/main/temp.pdf"
63
- fallback_pdf_url = "https://arxiv.org/pdf/2408.09869"
64
- try:
65
- text = extract_pdf_text(pdf_url, fallback_url=fallback_pdf_url)
66
- except Exception as e:
67
- logger.error(f"Failed to process PDF: {e}")
68
- raise
69
 
 
 
 
70
  docs_list = [Document(page_content=text)]
71
 
72
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)
 
17
 
18
  client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
19
 
20
+ def extract_pdf_text(url: str) -> str:
21
+ response = requests.get(url)
22
+ pdf_file = BytesIO(response.content)
23
+ text = extract_text(pdf_file)
24
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
+
27
+ pdf_url = "https://arxiv.org/pdf/2408.09869"
28
+ text = extract_pdf_text(pdf_url)
29
  docs_list = [Document(page_content=text)]
30
 
31
  text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=7500, chunk_overlap=100)