masadonline committed on
Commit
7be2761
·
verified ·
1 Parent(s): b594dbc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -8
app.py CHANGED
@@ -5,28 +5,45 @@ import streamlit as st
5
  from twilio.rest import Client
6
  from sentence_transformers import SentenceTransformer
7
  from transformers import AutoTokenizer
8
-
9
  import faiss
10
  import numpy as np
11
  import docx
12
-
13
  from groq import Groq
14
  import requests
15
  from io import StringIO
16
  from pdfminer.high_level import extract_text_to_fp
17
  from pdfminer.layout import LAParams
18
  from twilio.base.exceptions import TwilioRestException # Add this at the top
 
19
  import datetime
20
 
21
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
22
 
23
  os.environ["PYTORCH_JIT"] = "0"
24
  # --- PDF Extraction ---
 
25
  def extract_text_from_pdf(pdf_path):
26
- output_string = StringIO()
27
- with open(pdf_path, 'rb') as file:
28
- extract_text_to_fp(file, output_string, laparams=LAParams(), output_type='text', codec=None)
29
- return output_string.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def clean_extracted_text(text):
32
  lines = text.splitlines()
@@ -38,12 +55,20 @@ def clean_extracted_text(text):
38
  cleaned.append(line)
39
  return '\n'.join(cleaned)
40
 
 
 
 
 
 
 
 
 
41
  # --- DOCX Extraction ---
42
  def extract_text_from_docx(docx_path):
43
  try:
44
  doc = docx.Document(docx_path)
45
  return '\n'.join(para.text for para in doc.paragraphs)
46
- except:
47
  return ""
48
 
49
  # --- Chunking ---
@@ -117,7 +142,7 @@ def fetch_latest_incoming_message(client, conversation_sid):
117
  else:
118
  print(f"Twilio error fetching messages for {conversation_sid}:", e)
119
  except Exception as e:
120
- print(f"Unexpected error in fetch_latest_incoming_message for {conversation_sid}:", e)
121
 
122
  return None
123
 
 
5
  from twilio.rest import Client
6
  from sentence_transformers import SentenceTransformer
7
  from transformers import AutoTokenizer
 
8
  import faiss
9
  import numpy as np
10
  import docx
 
11
  from groq import Groq
12
  import requests
13
  from io import StringIO
14
  from pdfminer.high_level import extract_text_to_fp
15
  from pdfminer.layout import LAParams
16
  from twilio.base.exceptions import TwilioRestException # Add this at the top
17
+ import pdfplumber
18
  import datetime
19
 
20
  APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
21
 
22
  os.environ["PYTORCH_JIT"] = "0"
23
  # --- PDF Extraction ---
24
def extract_text_from_pdf(pdf_path):
    """Extract the text and tables of a PDF file as one string.

    pdfplumber is tried first: for every page the tables and the page text
    are collected. If pdfplumber raises for any reason, the error is printed
    and the whole document is re-extracted with pdfminer instead.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, followed by a blank line and the tables rendered
        by ``_format_tables_internal`` (an empty string when no tables were
        collected).

    Raises:
        Whatever the pdfminer fallback raises (for example ``OSError`` when
        the file cannot be opened); only pdfplumber errors are handled here.
    """
    page_texts = []
    tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract tables
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)
                # Extract text
                text = page.extract_text()
                if text:
                    page_texts.append(text + "\n\n")
        extracted_text = "".join(page_texts)
    except Exception as e:
        print(f"Error extracting with pdfplumber: {e}")
        # The fallback re-reads the entire document, so anything pdfplumber
        # collected before failing must be dropped; otherwise the first pages
        # would appear twice in the result and the tables would cover only
        # part of the file.
        tables = []
        fallback_output = StringIO()
        with open(pdf_path, 'rb') as file:
            extract_text_to_fp(
                file,
                fallback_output,
                laparams=LAParams(),
                output_type='text',
                codec=None,
            )
        extracted_text = fallback_output.getvalue()

    formatted_tables = _format_tables_internal(tables)
    return f"{extracted_text}\n\n{formatted_tables}"
47
 
48
  def clean_extracted_text(text):
49
  lines = text.splitlines()
 
55
  cleaned.append(line)
56
  return '\n'.join(cleaned)
57
 
58
+ def _format_tables_internal(tables):
59
+ formatted_tables = []
60
+ for table in tables:
61
+ # Basic formatting: joining rows with '|' and cells with ','
62
+ formatted_table = "\n".join(["|".join(row) for row in table])
63
+ formatted_tables.append(f"<table data>\n{formatted_table}\n</table>")
64
+ return "\n\n".join(formatted_tables)
65
+
66
  # --- DOCX Extraction ---
67
  def extract_text_from_docx(docx_path):
68
  try:
69
  doc = docx.Document(docx_path)
70
  return '\n'.join(para.text for para in doc.paragraphs)
71
+ except Exception:
72
  return ""
73
 
74
  # --- Chunking ---
 
142
  else:
143
  print(f"Twilio error fetching messages for {conversation_sid}:", e)
144
  except Exception as e:
145
+ #print(f"Unexpected error in fetch_latest_incoming_message for {conversation_sid}:", e)
146
 
147
  return None
148