masadonline committed
Commit 0ee59bd · verified · 1 Parent(s): e2a3960

Update app.py

Files changed (1)
1. app.py +6 -9
app.py CHANGED
@@ -14,30 +14,28 @@ import requests
 from streamlit_autorefresh import st_autorefresh
 
 # Extract text from PDF with fallback
+# --- Document Loaders ---
 def extract_text_from_pdf(pdf_path):
     try:
         text = ""
         with open(pdf_path, 'rb') as file:
             pdf_reader = PyPDF2.PdfReader(file)
-            for page in pdf_reader.pages:
+            for page_num in range(len(pdf_reader.pages)):
+                page = pdf_reader.pages[page_num]
                 page_text = page.extract_text()
                 if page_text:
                     text += page_text
         return text
-    except Exception as e:
-        st.write(f"Fallback pdfminer extraction: {e}")
+    except:
         return extract_text(pdf_path)
 
-# Extract text from DOCX
 def extract_text_from_docx(docx_path):
     try:
         doc = docx.Document(docx_path)
         return '\n'.join(para.text for para in doc.paragraphs)
-    except Exception as e:
-        st.write(f"Docx extraction error: {e}")
+    except:
         return ""
 
-# Chunk text based on tokens
 def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
     tokens = tokenizer.tokenize(text)
     chunks, start = [], 0
@@ -48,10 +46,9 @@ def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
     start += chunk_size - chunk_overlap
     return chunks
 
-# Retrieve relevant chunks from index
 def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
     question_embedding = embed_model.encode([question])[0]
-    D, I = index.search(np.array([question_embedding]).astype('float32'), k)
+    D, I = index.search(np.array([question_embedding]), k)
     return [text_chunks[i] for i in I[0]]
 
 # Generate answer using Groq API with retries and timeout
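For context on the first hunk: the loaders keep a PyPDF2-first, pdfminer-fallback strategy, and the commit replaces the logged exceptions with bare `except:` clauses. Below is a minimal standalone sketch of the same pattern, assuming `PyPDF2`, `pdfminer.six`, and `python-docx` are installed and that `extract_text` comes from `pdfminer.high_level` as the surrounding imports suggest (the helper names `load_pdf_with_fallback` and `load_docx` are illustrative, not from the commit):

```python
import PyPDF2
import docx
from pdfminer.high_level import extract_text  # assumed source of the fallback

def load_pdf_with_fallback(pdf_path):
    """PyPDF2 first; if it raises or yields nothing, retry with pdfminer."""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = "".join(page.extract_text() or "" for page in reader.pages)
        if text.strip():
            return text
    except Exception:
        pass  # e.g. encrypted or malformed PDFs; fall through to pdfminer
    return extract_text(pdf_path)

def load_docx(docx_path):
    """Join paragraph text; return an empty string on any read error."""
    try:
        return '\n'.join(para.text for para in docx.Document(docx_path).paragraphs)
    except Exception:
        return ""
```

Note that a bare `except:` also swallows `KeyboardInterrupt` and `SystemExit`; `except Exception:` is the narrower, more conventional choice.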
  return ""
38
 
 
39
  def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
40
  tokens = tokenizer.tokenize(text)
41
  chunks, start = [], 0
 
46
  start += chunk_size - chunk_overlap
47
  return chunks
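The body of `chunk_text` between the two hunks is elided by the diff. Here is a sketch of a sliding-window chunker consistent with the visible lines (tokenize, then advance the window start by `chunk_size - chunk_overlap`), assuming a Hugging Face tokenizer that provides `tokenize` and `convert_tokens_to_string`:

```python
def chunk_text(text, tokenizer, chunk_size=150, chunk_overlap=30):
    # Sliding token window: consecutive chunks share `chunk_overlap` tokens,
    # so a sentence cut at one boundary still appears whole in a neighbor.
    tokens = tokenizer.tokenize(text)
    chunks, start = [], 0
    while start < len(tokens):
        window = tokens[start:start + chunk_size]
        chunks.append(tokenizer.convert_tokens_to_string(window))
        start += chunk_size - chunk_overlap  # defaults: advance 120 tokens per step
    return chunks
```

With the defaults, each 150-token chunk overlaps its predecessor by 30 tokens, i.e. the window advances 120 tokens per iteration.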
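On the second hunk: the commit drops the explicit `.astype('float32')` cast before `index.search`. FAISS only accepts float32 input, but `sentence-transformers` returns float32 NumPy arrays from `encode` by default, which is presumably why the cast was judged redundant. The sketch below shows the retrieval path end to end (the model name and sample chunks are illustrative assumptions, not taken from this commit); it keeps a defensive cast, which is a no-op when the dtype is already float32 but prevents a hard faiss type error if an embedding backend ever returns float64:

```python
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

embed_model = SentenceTransformer("all-MiniLM-L6-v2")  # illustrative model choice
text_chunks = ["chunk one ...", "chunk two ...", "chunk three ..."]

# Build a flat L2 index over the chunk embeddings.
embeddings = np.asarray(embed_model.encode(text_chunks), dtype="float32")
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

def retrieve_chunks(question, index, embed_model, text_chunks, k=3):
    q = np.asarray([embed_model.encode(question)], dtype="float32")  # shape (1, d)
    D, I = index.search(q, k)  # D: distances, I: indices into text_chunks
    return [text_chunks[i] for i in I[0]]

print(retrieve_chunks("what is in chunk two?", index, embed_model, text_chunks, k=2))
```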