Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -16,22 +16,42 @@ from pdfminer.layout import LAParams
|
|
16 |
from twilio.base.exceptions import TwilioRestException # Add this at the top
|
17 |
import pdfplumber
|
18 |
import datetime
|
|
|
19 |
|
20 |
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
|
21 |
|
22 |
os.environ["PYTORCH_JIT"] = "0"
|
|
|
23 |
# --- PDF Extraction ---
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
def extract_text_from_pdf(pdf_path):
|
26 |
text_output = StringIO()
|
27 |
-
|
28 |
try:
|
29 |
with pdfplumber.open(pdf_path) as pdf:
|
30 |
for page in pdf.pages:
|
31 |
# Extract tables
|
32 |
-
page_tables = page
|
33 |
if page_tables:
|
34 |
-
|
35 |
# Extract text
|
36 |
text = page.extract_text()
|
37 |
if text:
|
@@ -42,8 +62,7 @@ def extract_text_from_pdf(pdf_path):
|
|
42 |
with open(pdf_path, 'rb') as file:
|
43 |
extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
|
44 |
extracted_text = text_output.getvalue()
|
45 |
-
|
46 |
-
return f"{extracted_text}\n\n{formatted_tables}"
|
47 |
|
48 |
def clean_extracted_text(text):
|
49 |
lines = text.splitlines()
|
@@ -56,12 +75,16 @@ def clean_extracted_text(text):
|
|
56 |
return '\n'.join(cleaned)
|
57 |
|
58 |
def _format_tables_internal(tables):
|
59 |
-
|
|
|
|
|
60 |
for table in tables:
|
61 |
-
#
|
62 |
-
|
63 |
-
|
64 |
-
|
|
|
|
|
65 |
|
66 |
# --- DOCX Extraction ---
|
67 |
def extract_text_from_docx(docx_path):
|
|
|
16 |
from twilio.base.exceptions import TwilioRestException # Add this at the top
|
17 |
import pdfplumber
|
18 |
import datetime
|
19 |
+
import csv
|
20 |
|
21 |
APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
|
22 |
|
23 |
os.environ["PYTORCH_JIT"] = "0"
|
24 |
+
|
25 |
# --- PDF Extraction ---
|
26 |
+
def _extract_tables_from_page(page):
    """Return the tables found on one PDF page with ``None`` cells blanked.

    Args:
        page: Any object exposing ``extract_tables()``, which is expected to
            yield a list of tables, each a list of rows, each a list of cells.

    Returns:
        A list of tables (list of rows of cells). Every ``None`` cell is
        replaced by ``""``. A falsy row (``None`` or an empty list) is not
        dropped; it is kept as the single-cell row ``[""]``. An empty list is
        returned when ``extract_tables()`` yields nothing.
    """
    raw_tables = page.extract_tables()
    if not raw_tables:
        return []

    def _normalise_row(row):
        # Keep a placeholder for missing/empty rows so the row count of the
        # table is preserved.
        if not row:
            return [""]
        return ["" if cell is None else cell for cell in row]

    return [[_normalise_row(row) for row in table] for table in raw_tables]
|
44 |
+
|
45 |
def extract_text_from_pdf(pdf_path):
|
46 |
text_output = StringIO()
|
47 |
+
all_tables = []
|
48 |
try:
|
49 |
with pdfplumber.open(pdf_path) as pdf:
|
50 |
for page in pdf.pages:
|
51 |
# Extract tables
|
52 |
+
page_tables = _extract_tables_from_page(page)
|
53 |
if page_tables:
|
54 |
+
all_tables.extend(page_tables)
|
55 |
# Extract text
|
56 |
text = page.extract_text()
|
57 |
if text:
|
|
|
62 |
with open(pdf_path, 'rb') as file:
|
63 |
extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
|
64 |
extracted_text = text_output.getvalue()
|
65 |
+
return extracted_text, all_tables # Return text and list of tables
|
|
|
66 |
|
67 |
def clean_extracted_text(text):
|
68 |
lines = text.splitlines()
|
|
|
75 |
return '\n'.join(cleaned)
|
76 |
|
77 |
def _format_tables_internal(tables):
    """Render extracted tables as CSV text, one CSV chunk per table.

    Args:
        tables: An iterable of tables, where each table is an iterable of
            rows and each row is an iterable of cell values.

    Returns:
        A single string with each table serialised by ``csv.writer`` (so
        commas and quotes inside cells are escaped), and the per-table chunks
        separated by ``"\\n\\n"``. Rows end with the writer's default
        ``"\\r\\n"`` terminator. Returns ``""`` when ``tables`` is empty.
    """

    def _table_to_csv(rows):
        # An in-memory buffer lets the csv module handle quoting for us.
        with StringIO() as buffer:
            csv.writer(buffer).writerows(rows)
            return buffer.getvalue()

    return "\n\n".join(_table_to_csv(table) for table in tables)
|
88 |
|
89 |
# --- DOCX Extraction ---
|
90 |
def extract_text_from_docx(docx_path):
|