Spaces:

masadonline
/

Quasa

Sleeping

App Files Community

masadonline committed on May 17

Commit

de2271c

verified ·

1 Parent(s): ea00c43

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -11

app.py CHANGED Viewed

@@ -16,22 +16,42 @@ from pdfminer.layout import LAParams
 from twilio.base.exceptions import TwilioRestException  # Add this at the top
 import pdfplumber
 import datetime
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
 # --- PDF Extraction ---
-# --- PDF Extraction ---
 def extract_text_from_pdf(pdf_path):
     text_output = StringIO()
-    tables = []
     try:
         with pdfplumber.open(pdf_path) as pdf:
             for page in pdf.pages:
                 # Extract tables
-                page_tables = page.extract_tables()
                 if page_tables:
-                    tables.extend(page_tables)
                 # Extract text
                 text = page.extract_text()
                 if text:
@@ -42,8 +62,7 @@ def extract_text_from_pdf(pdf_path):
         with open(pdf_path, 'rb') as file:
             extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
     extracted_text = text_output.getvalue()
-    formatted_tables = _format_tables_internal(tables)
-    return f"{extracted_text}\n\n{formatted_tables}"
 def clean_extracted_text(text):
     lines = text.splitlines()
@@ -56,12 +75,16 @@ def clean_extracted_text(text):
     return '\n'.join(cleaned)
 def _format_tables_internal(tables):
-    formatted_tables = []
     for table in tables:
-        # Basic formatting: joining rows with '|' and cells with ','
-        formatted_table = "\n".join(["|".join(row) for row in table])
-        formatted_tables.append(f"<table data>\n{formatted_table}\n</table>")
-    return "\n\n".join(formatted_tables)
 # --- DOCX Extraction ---
 def extract_text_from_docx(docx_path):

 from twilio.base.exceptions import TwilioRestException  # Add this at the top
 import pdfplumber
 import datetime
+import csv
 APP_START_TIME = datetime.datetime.now(datetime.timezone.utc)
 os.environ["PYTORCH_JIT"] = "0"
 # --- PDF Extraction ---
+def _extract_tables_from_page(page):
+    """Extracts tables from a single page of a PDF."""
+    tables = page.extract_tables()
+    if not tables:
+        return []
+    formatted_tables = []
+    for table in tables:
+        formatted_table = []
+        for row in table:
+            if row:  # Filter out empty rows
+                formatted_row = [cell if cell is not None else "" for cell in row]  # Replace None with ""
+                formatted_table.append(formatted_row)
+            else:
+                formatted_table.append([""])  # Append an empty row if the row is None
+        formatted_tables.append(formatted_table)
+    return formatted_tables
 def extract_text_from_pdf(pdf_path):
     text_output = StringIO()
+    all_tables = []
     try:
         with pdfplumber.open(pdf_path) as pdf:
             for page in pdf.pages:
                 # Extract tables
+                page_tables = _extract_tables_from_page(page)
                 if page_tables:
+                    all_tables.extend(page_tables)
                 # Extract text
                 text = page.extract_text()
                 if text:
         with open(pdf_path, 'rb') as file:
             extract_text_to_fp(file, text_output, laparams=LAParams(), output_type='text', codec=None)
     extracted_text = text_output.getvalue()
+    return extracted_text, all_tables  # Return text and list of tables
 def clean_extracted_text(text):
     lines = text.splitlines()
     return '\n'.join(cleaned)
 def _format_tables_internal(tables):
+    """Formats extracted tables into a string representation."""
+    formatted_tables_str = []
     for table in tables:
+        # Use csv writer to handle commas and quotes correctly
+        with StringIO() as csvfile:
+            csvwriter = csv.writer(csvfile)
+            csvwriter.writerows(table)
+            formatted_tables_str.append(csvfile.getvalue())
+    return "\n\n".join(formatted_tables_str)
 # --- DOCX Extraction ---
 def extract_text_from_docx(docx_path):