Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,28 +5,45 @@ import streamlit as st
|
|
5 |
from twilio.rest import Client
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
from transformers import AutoTokenizer
|
8 |
-
|
9 |
import faiss
|
10 |
import numpy as np
|
11 |
import docx
|
12 |
-
|
13 |
from groq import Groq
|
14 |
import requests
|
15 |
from io import StringIO
|
16 |
from pdfminer.high_level import extract_text_to_fp
|
17 |
from pdfminer.layout import LAParams
|
18 |
from twilio.base.exceptions import TwilioRestException # Add this at the top
|
|
|
19 |
import datetime
|
20 |
|
21 |
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
|
22 |
|
23 |
os.environ["PYTORCH_JIT"] = "0"
|
24 |
# --- PDF Extraction ---
|
|
|
25 |
def extract_text_from_pdf(pdf_path):
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
def clean_extracted_text(text):
|
32 |
lines = text.splitlines()
|
@@ -38,12 +55,20 @@ def clean_extracted_text(text):
|
|
38 |
cleaned.append(line)
|
39 |
return '\n'.join(cleaned)
|
40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
# --- DOCX Extraction ---
|
42 |
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    This is a best-effort reader: any error raised while opening or reading
    the document is reported on stdout and an empty string is returned.

    Args:
        docx_path: Path of the DOCX file, passed to ``docx.Document``.

    Returns:
        The paragraph texts joined with newlines, or "" on failure.
    """
    try:
        doc = docx.Document(docx_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # ``except Exception`` (not a bare ``except``) so KeyboardInterrupt
        # and SystemExit still propagate; the error is reported, not hidden.
        print(f"Error extracting text from DOCX {docx_path}: {e}")
        return ""
|
48 |
|
49 |
# --- Chunking ---
|
@@ -117,7 +142,7 @@ def fetch_latest_incoming_message(client, conversation_sid):
|
|
117 |
else:
|
118 |
print(f"Twilio error fetching messages for {conversation_sid}:", e)
|
119 |
except Exception as e:
|
120 |
-
print(f"Unexpected error in fetch_latest_incoming_message for {conversation_sid}:", e)
|
121 |
|
122 |
return None
|
123 |
|
|
|
5 |
from twilio.rest import Client
|
6 |
from sentence_transformers import SentenceTransformer
|
7 |
from transformers import AutoTokenizer
|
|
|
8 |
import faiss
|
9 |
import numpy as np
|
10 |
import docx
|
|
|
11 |
from groq import Groq
|
12 |
import requests
|
13 |
from io import StringIO
|
14 |
from pdfminer.high_level import extract_text_to_fp
|
15 |
from pdfminer.layout import LAParams
|
16 |
from twilio.base.exceptions import TwilioRestException # Add this at the top
|
17 |
+
import pdfplumber
|
18 |
import datetime
|
19 |
|
20 |
# Timezone-aware (UTC) timestamp captured once, when this module is loaded.
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)

# Sets PYTORCH_JIT=0 in the process environment.
# NOTE(review): presumably meant to disable the PyTorch JIT — confirm it is
# set early enough to take effect.
os.environ["PYTORCH_JIT"] = "0"
|
23 |
# --- PDF Extraction ---
|
24 |
+
# pdfplumber is tried first (page text and tables); pdfminer is the text-only fallback.
|
25 |
def extract_text_from_pdf(pdf_path):
    """Extract the text and tables of a PDF file as a single string.

    pdfplumber is tried first: for every page its tables are collected and
    its text is appended to a buffer. If pdfplumber raises, the error is
    printed and the whole document is re-extracted with pdfminer's
    ``extract_text_to_fp``. Errors raised by the fallback itself propagate
    to the caller.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, followed by two newlines and the tables rendered
        by ``_format_tables_internal`` (an empty string when no tables were
        collected).
    """
    text_output = StringIO()
    tables = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # Extract tables
                page_tables = page.extract_tables()
                if page_tables:
                    tables.extend(page_tables)
                # Extract text
                text = page.extract_text()
                if text:
                    text_output.write(text + "\n\n")
    except Exception as e:
        print(f"Error extracting with pdfplumber: {e}")
        # pdfplumber may have failed part-way through the document. The
        # fallback re-reads the entire file, so start from an empty buffer;
        # otherwise the pages already written would appear twice.
        text_output = StringIO()
        # Fallback to pdfminer if pdfplumber fails
        with open(pdf_path, 'rb') as file:
            extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
    extracted_text = text_output.getvalue()
    formatted_tables = _format_tables_internal(tables)
    return f"{extracted_text}\n\n{formatted_tables}"
|
47 |
|
48 |
def clean_extracted_text(text):
|
49 |
lines = text.splitlines()
|
|
|
55 |
cleaned.append(line)
|
56 |
return '\n'.join(cleaned)
|
57 |
|
58 |
+
def _format_tables_internal(tables):
|
59 |
+
formatted_tables = []
|
60 |
+
for table in tables:
|
61 |
+
# Basic formatting: joining rows with '|' and cells with ','
|
62 |
+
formatted_table = "\n".join(["|".join(row) for row in table])
|
63 |
+
formatted_tables.append(f"<table data>\n{formatted_table}\n</table>")
|
64 |
+
return "\n\n".join(formatted_tables)
|
65 |
+
|
66 |
# --- DOCX Extraction ---
|
67 |
def extract_text_from_docx(docx_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    This is a best-effort reader: any error raised while opening or reading
    the document is reported on stdout and an empty string is returned.

    Args:
        docx_path: Path of the DOCX file, passed to ``docx.Document``.

    Returns:
        The paragraph texts joined with newlines, or "" on failure.
    """
    try:
        doc = docx.Document(docx_path)
        return '\n'.join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Report the failure instead of hiding it; callers still get "" so
        # an unreadable document does not abort processing.
        print(f"Error extracting text from DOCX {docx_path}: {e}")
        return ""
|
73 |
|
74 |
# --- Chunking ---
|
|
|
142 |
else:
|
143 |
print(f"Twilio error fetching messages for {conversation_sid}:", e)
|
144 |
except Exception as e:
|
145 |
+
#print(f"Unexpected error in fetch_latest_incoming_message for {conversation_sid}:", e)
|
146 |
|
147 |
return None
|
148 |
|