Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

6a30f2e

verified ·

1 Parent(s): 7cb3598

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -4

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from PIL import Image
 import pandas as pd
 import pdfplumber
 import tempfile
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     """
@@ -65,14 +66,14 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
-            with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
                         "text": text,
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
-                    json.dump(json_data, tmp, indent=4)
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
@@ -83,7 +84,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
-                    tmp.write(markdown_text.encode('utf-8'))
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
@@ -94,11 +95,13 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
-                    tmp.write(html_text.encode('utf-8'))
                 download_path = tmp.name
             return text, download_path
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
         return "", None

 import pandas as pd
 import pdfplumber
 import tempfile
+import traceback
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     """
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
+            with tempfile.NamedTemporaryFile(mode="wb", delete=False, suffix="." + output_format.lower()) as tmp:
                 if output_format == "JSON":
                     json_data = {
                         "text": text,
                         "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
                         "images": images
                     }
+                    json.dump(json_data, tmp, indent=4)
                 elif output_format == "Markdown":
                     markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         markdown_text += f'![Image]({image_path})\n'
+                    tmp.write(markdown_text.encode('utf-8'))
                 elif output_format == "HTML":
                     html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                     for i, table in enumerate(tables):
                     for image in images:
                         image_path = os.path.join(os.getcwd(), image["filename"])
                         html_text += f'<img src="{image_path}" alt="Image"><br>\n'
+                    tmp.write(html_text.encode('utf-8'))
                 download_path = tmp.name
             return text, download_path
     except Exception as main_e:
+        traceback.print_exc()  # Print full traceback to console
         print(f"A main error occurred: {main_e}")
         return "", None