Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Community

sblumenf committed on Dec 12, 2024

Commit

1f2e0af

verified ·

1 Parent(s): 875f540

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -6

app.py CHANGED Viewed

@@ -73,15 +73,20 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
             if output_format == "JSON":
                 json_data = {
                     "text": text,
-                    "tables": [table.to_dict(orient='records') for table in tables],  # Use 'records' for better handling of duplicate columns
                     "images": images
                 }
                 download_data = json.dumps(json_data, indent=4)  # Add indentation for readability
             elif output_format == "Markdown":
                 markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                 for i, table in enumerate(tables):
-                    markdown_text += f"## Table {i+1}\n"
-                    markdown_text += table.to_markdown(index=False) + "\n\n"
                 # Image embedding in Markdown (using relative paths)
                 markdown_text += "\n\n# Images\n\n"
@@ -93,8 +98,9 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
             elif output_format == "HTML":
                 html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                 for i, table in enumerate(tables):
-                    html_text += f"<h2>Table {i+1}</h2>\n"
-                    html_text += table.to_html() + "<br>"
                 # Image embedding in HTML (using relative paths)
                 html_text += "\n\n<h2>Images</h2>\n\n"
@@ -121,4 +127,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch(share=False)

             if output_format == "JSON":
                 json_data = {
                     "text": text,
+                    "tables": [
+                        table.to_dict(orient='records')
+                        for table in tables
+                        if not table.columns.duplicated().any()
+                    ],  # Use 'records' for better handling of duplicate columns
                     "images": images
                 }
                 download_data = json.dumps(json_data, indent=4)  # Add indentation for readability
             elif output_format == "Markdown":
                 markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
                 for i, table in enumerate(tables):
+                    if not table.columns.duplicated().any():  # Check for duplicate columns
+                        markdown_text += f"## Table {i+1}\n"
+                        markdown_text += table.to_markdown(index=False) + "\n\n"
                 # Image embedding in Markdown (using relative paths)
                 markdown_text += "\n\n# Images\n\n"
             elif output_format == "HTML":
                 html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
                 for i, table in enumerate(tables):
+                    if not table.columns.duplicated().any():  # Check for duplicate columns
+                        html_text += f"<h2>Table {i+1}</h2>\n"
+                        html_text += table.to_html() + "<br>"
                 # Image embedding in HTML (using relative paths)
                 html_text += "\n\n<h2>Images</h2>\n\n"
 )
 if __name__ == "__main__":
+    iface.launch(share=True)  # Set share=True to create a public link