Spaces:

sblumenf
/

pdf-convert

Sleeping

App Files Files Community

sblumenf committed on Dec 12, 2024

Commit

3403d47

verified ·

1 Parent(s): 1f2e0af

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -61

app.py CHANGED Viewed

@@ -7,29 +7,16 @@ import io
 from PIL import Image
 import pandas as pd
 import pdfplumber
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
-    """
-    Parses a PDF file, extracts text, tables, and images, and formats the output.
-    Args:
-        pdf_file: Path to the uploaded PDF file.
-        output_format: Desired output format ("JSON", "Markdown", or "HTML").
-        progress: Gradio Progress object for displaying progress.
-    Returns:
-        tuple: Extracted text and download data in the specified format.
-            Returns an empty string and None if there is an error.
-    """
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
             tables = []
             images = []
-            # Iterate directly over pages
             for page in extract_pages(file):
-                # progress(i / len(pages))  # Update progress bar (if you still want to use a progress bar, you'll need to determine the total number of pages beforehand)
                 for element in page:
                     if isinstance(element, LTTextBoxHorizontal):
                         text += element.get_text()
@@ -52,64 +39,56 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
                         except Exception as e:
                             print(f"Error extracting image: {e}")
-            # Enhanced table extraction using pdfplumber
             with pdfplumber.open(pdf_file) as pdf:
                 for page_num, page in enumerate(pdf.pages):
                     for table in page.extract_tables():
-                        # Handle potential duplicate columns
                         if len(table) > 0 and len(set(table[0])) != len(table[0]):
-                            # If duplicate columns exist, try to create unique column names
                             unique_columns = []
                             for col in table[0]:
                                 if col in unique_columns:
-                                    col = f"{col}_{unique_columns.count(col)}"  # Append a counter
                                 unique_columns.append(col)
                             df = pd.DataFrame(table[1:], columns=unique_columns)
                         else:
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
-            # Format extracted data based on user selection
-            if output_format == "JSON":
-                json_data = {
-                    "text": text,
-                    "tables": [
-                        table.to_dict(orient='records')
-                        for table in tables
-                        if not table.columns.duplicated().any()
-                    ],  # Use 'records' for better handling of duplicate columns
-                    "images": images
-                }
-                download_data = json.dumps(json_data, indent=4)  # Add indentation for readability
-            elif output_format == "Markdown":
-                markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
-                for i, table in enumerate(tables):
-                    if not table.columns.duplicated().any():  # Check for duplicate columns
-                        markdown_text += f"## Table {i+1}\n"
-                        markdown_text += table.to_markdown(index=False) + "\n\n"
-                # Image embedding in Markdown (using relative paths)
-                markdown_text += "\n\n# Images\n\n"
-                for image in images:
-                    image_path = os.path.join(os.getcwd(), image["filename"])
-                    markdown_text += f'![Image]({image_path})\n'
-                download_data = markdown_text
-            elif output_format == "HTML":
-                html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
-                for i, table in enumerate(tables):
-                    if not table.columns.duplicated().any():  # Check for duplicate columns
-                        html_text += f"<h2>Table {i+1}</h2>\n"
-                        html_text += table.to_html() + "<br>"
-                # Image embedding in HTML (using relative paths)
-                html_text += "\n\n<h2>Images</h2>\n\n"
-                for image in images:
-                    image_path = os.path.join(os.getcwd(), image["filename"])
-                    html_text += f'<img src="{image_path}" alt="Image"><br>\n'
-                download_data = html_text.encode("utf-8")  # Encode for HTML download
-            return text, download_data
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
@@ -117,7 +96,7 @@ def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
 iface = gr.Interface(
     fn=parse_pdf,
-    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],  # Remove gr.Progress() from inputs
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")
@@ -127,4 +106,4 @@ iface = gr.Interface(
 )
 if __name__ == "__main__":
-    iface.launch(share=True)  # Set share=True to create a public link

 from PIL import Image
 import pandas as pd
 import pdfplumber
+import tempfile  # Import tempfile
 def parse_pdf(pdf_file, output_format, progress=gr.Progress()):
     try:
         with open(pdf_file, 'rb') as file:
             text = ""
             tables = []
             images = []
             for page in extract_pages(file):
                 for element in page:
                     if isinstance(element, LTTextBoxHorizontal):
                         text += element.get_text()
                         except Exception as e:
                             print(f"Error extracting image: {e}")
             with pdfplumber.open(pdf_file) as pdf:
                 for page_num, page in enumerate(pdf.pages):
                     for table in page.extract_tables():
                         if len(table) > 0 and len(set(table[0])) != len(table[0]):
                             unique_columns = []
                             for col in table[0]:
                                 if col in unique_columns:
+                                    col = f"{col}_{unique_columns.count(col)}"
                                 unique_columns.append(col)
                             df = pd.DataFrame(table[1:], columns=unique_columns)
                         else:
                             df = pd.DataFrame(table[1:], columns=table[0] if table[0] else None)
                         tables.append(df)
+            # Use a temporary file for the download
+            with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix="." + output_format.lower()) as tmp:
+                if output_format == "JSON":
+                    json_data = {
+                        "text": text,
+                        "tables": [table.to_dict(orient='records') for table in tables if not table.columns.duplicated().any()],
+                        "images": images
+                    }
+                    json.dump(json_data, tmp, indent=4)
+                    download_path = tmp.name
+                elif output_format == "Markdown":
+                    markdown_text = f"# Extracted Text\n\n{text}\n\n# Tables\n"
+                    for i, table in enumerate(tables):
+                        if not table.columns.duplicated().any():
+                            markdown_text += f"## Table {i+1}\n"
+                            markdown_text += table.to_markdown(index=False) + "\n\n"
+                    markdown_text += "\n\n# Images\n\n"
+                    for image in images:
+                        image_path = os.path.join(os.getcwd(), image["filename"])
+                        markdown_text += f'![Image]({image_path})\n'
+                    tmp.write(markdown_text.encode('utf-8'))
+                    download_path = tmp.name
+                elif output_format == "HTML":
+                    html_text = f"<p>{text}</p>\n\n<h2>Tables</h2>\n"
+                    for i, table in enumerate(tables):
+                        if not table.columns.duplicated().any():
+                            html_text += f"<h2>Table {i+1}</h2>\n"
+                            html_text += table.to_html() + "<br>"
+                    html_text += "\n\n<h2>Images</h2>\n\n"
+                    for image in images:
+                        image_path = os.path.join(os.getcwd(), image["filename"])
+                        html_text += f'<img src="{image_path}" alt="Image"><br>\n'
+                    tmp.write(html_text.encode('utf-8'))
+                    download_path = tmp.name
+            return text, download_path
     except Exception as main_e:
         print(f"A main error occurred: {main_e}")
 iface = gr.Interface(
     fn=parse_pdf,
+    inputs=["file", gr.Dropdown(["JSON", "Markdown", "HTML"])],
     outputs=[
         gr.Text(label="Output Text"),
         gr.File(label="Download Output")
 )
 if __name__ == "__main__":
+    iface.launch(share=True)