Update app.py
app.py CHANGED
@@ -14,30 +14,28 @@ import requests
 from streamlit_autorefresh import st_autorefresh
 
 # Extract text from PDF with fallback
+# --- Document Loaders ---
 def extract_text_from_pdf(pdf_path):
     try:
         text = ""
         with open(pdf_path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            for page in pdf_reader.pages:
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
                 page_text = page.extract_text()
                 if page_text:
                     text += page_text
         return text
-    except Exception as e:
-        st.write(f"Fallback pdfminer extraction: {e}")
+    except:
         return extract_text(pdf_path)
 
-# Extract text from DOCX
 def extract_text_from_docx(docx_path):
     try:
         doc = docx.Document(docx_path)
         return '\n'.join(para.text for para in doc.paragraphs)
-    except Exception as e:
-        st.write(f"Docx extraction error: {e}")
+    except:
         return ""
 
-# Chunk text based on tokens
 def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
     tokens = tokenizer.tokenize(text)
     chunks, start = [], 0
@@ -48,10 +46,9 @@ def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
         start += chunk_size - chunk_overlap
     return chunks
 
-# Retrieve relevant chunks from index
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode([question])[0]
-    D, I = index.search(np.array([question_embedding])
+    D, I = index.search(np.array([question_embedding]), k)
     return [text_chunks[i] for i in I[0]]
 
 # Generate answer using Groq API with retries and timeout
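
A note on the windowing in chunk_text: the loop body falls outside both hunks, but the visible lines pin down the stride (chunk_size - chunk_overlap tokens). A minimal sketch of that arithmetic over plain token lists; the slicing body is an assumption, not code from app.py:

    # Hypothetical reconstruction of the sliding token window in chunk_text;
    # only the stride line (start += chunk_size - chunk_overlap) appears in the diff.
    def chunk_text_sketch(tokens, chunk_size=150, chunk_overlap=30):
        chunks, start = [], 0
        while start < len(tokens):
            chunks.append(tokens[start:start + chunk_size])  # one window of tokens
            start += chunk_size - chunk_overlap              # stride of 120 tokens
        return chunks

    # 300 tokens -> windows starting at 0, 120, 240:
    print([len(c) for c in chunk_text_sketch(list(range(300)))])  # [150, 150, 60]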
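
The second hunk restores the k argument (and the closing parenthesis) that the old index.search call dropped. A quick smoke test of the fixed retrieval path; faiss-cpu, sentence-transformers, the all-MiniLM-L6-v2 model, and the IndexFlatL2 index type are assumptions here, since app.py's index construction is not shown in this diff:

    import faiss
    import numpy as np
    from sentence_transformers import SentenceTransformer

    embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model
    text_chunks = [
        "FAISS is a library for vector similarity search.",
        "Streamlit builds interactive data apps in Python.",
        "Groq serves low-latency LLM inference over an API.",
    ]

    # Embed the chunks and index them with exact L2 search.
    embeddings = np.asarray(embed_model.encode(text_chunks), dtype="float32")
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Same call shape as the fixed line: search(query_matrix, k) -> distances, ids.
    question_embedding = embed_model.encode(["What is FAISS?"])[0]
    D, I = index.search(np.asarray([question_embedding], dtype="float32"), 2)
    print([text_chunks[i] for i in I[0]])  # the two nearest chunks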