Update app.py
app.py CHANGED
```diff
@@ -35,7 +35,6 @@ def check_poppler():
 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
-        # Verify token
         if not HF_TOKEN:
             raise ValueError("HF_TOKEN is not set")
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
@@ -45,9 +44,8 @@ def ensure_hf_dataset():
         logger.error(f"Failed to create/access dataset repo: {str(e)}")
         return f"Error: Failed to create/access dataset repo: {str(e)}"
 
-def upload_image_to_hf(image, filename, status_callback):
+def upload_image_to_hf(image, filename):
     """Upload an image to Hugging Face dataset and return its URL."""
-    status_callback("Checking dataset access...")
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
@@ -58,7 +56,6 @@ def upload_image_to_hf(image, filename, status_callback):
         image.save(temp_path, format="PNG")
 
         # Upload to Hugging Face dataset
-        status_callback(f"Uploading image {filename}...")
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
@@ -73,9 +70,8 @@ def upload_image_to_hf(image, filename, status_callback):
         logger.error(f"Error uploading image: {str(e)}")
         return f"Error uploading image: {str(e)}"
 
-def extract_text_from_pdf(pdf_input, status_callback):
+def extract_text_from_pdf(pdf_input):
     """Extract text from PDF using pdfplumber."""
-    status_callback("Extracting text from PDF...")
     try:
         if isinstance(pdf_input, str):  # URL case
             response = requests.get(pdf_input, stream=True)
@@ -96,31 +92,27 @@ def extract_text_from_pdf(pdf_input, status_callback):
         logger.error(f"Error extracting text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 
-def extract_images_from_pdf(pdf_input, status_callback):
+def extract_images_from_pdf(pdf_input):
     """Extract images from PDF and convert to PIL images."""
-    status_callback("Checking poppler-utils...")
     if not check_poppler():
         return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
 
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
-            status_callback("Downloading PDF for image extraction...")
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
             logger.info(f"Processing uploaded PDF: {pdf_input.name}")
-            status_callback("Extracting images from uploaded PDF...")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
         logger.error(f"Error extracting images: {str(e)}")
         return f"Error extracting images: {str(e)}"
 
-def format_to_markdown(text, images, status_callback):
+def format_to_markdown(text, images):
     """Convert extracted text and images to Markdown format."""
-    status_callback("Formatting output as Markdown...")
     markdown_output = "# Extracted PDF Content\n\n"
 
     # Clean and format text
@@ -143,7 +135,7 @@ def format_to_markdown(text, images, status_callback):
         ocr_text = pytesseract.image_to_string(image).strip()
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"image_{i}_{timestamp}"
-        image_url = upload_image_to_hf(image, filename, status_callback)
+        image_url = upload_image_to_hf(image, filename)
 
         if not image_url.startswith("Error"):
             markdown_output += f"\n"
@@ -154,75 +146,72 @@ def format_to_markdown(text, images, status_callback):
 
     return markdown_output
 
-def process_pdf(pdf_input, pdf_url
+def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
+    status = ["Starting PDF processing..."]
     logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
 
-    def
-
-        status
-        return status
-
-    status_callback("Starting PDF processing...")
+    def update_status(message):
+        status[0] = message
+        return status[0]
 
     if not HF_TOKEN:
-
-        return "Error: HF_TOKEN not set in Spaces Secrets.", status
+        update_status("Error: HF_TOKEN not set.")
+        return "Error: HF_TOKEN not set in Spaces Secrets.", status[0]
 
     # Log poppler status
     logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
+    update_status("Checking poppler-utils...")
 
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
         logger.info(f"Decoded URL: {pdf_url}")
-
+        update_status(f"Downloading PDF from URL: {pdf_url}")
         try:
             response = requests.head(pdf_url, allow_redirects=True)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
             logger.error(f"Error accessing URL: {str(e)}")
-
-            return f"Error accessing URL: {str(e)}", status
+            update_status(f"Error accessing URL: {str(e)}")
+            return f"Error accessing URL: {str(e)}", status[0]
     elif not pdf_input:
-
-        return "Error: Please provide a PDF file or URL.", status
+        update_status("Error: No PDF provided.")
+        return "Error: Please provide a PDF file or URL.", status[0]
 
-    text
-
+    update_status("Extracting text from PDF...")
+    text = extract_text_from_pdf(pdf_input)
+    update_status("Extracting images from PDF...")
+    images = extract_images_from_pdf(pdf_input)
 
     if isinstance(text, str) and text.startswith("Error"):
-
-        return text, status
+        update_status("Text extraction failed.")
+        return text, status[0]
     if isinstance(images, str) and images.startswith("Error"):
-
-        return images, status
+        update_status("Image extraction failed.")
+        return images, status[0]
 
-
-
-
+    update_status("Formatting output as Markdown...")
+    markdown_output = format_to_markdown(text, images)
+    update_status("Processing complete.")
+    return markdown_output, status[0]
 
 # Gradio Interface
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    fn=process_pdf,
-    inputs=[pdf_input, pdf_url, status],
-    outputs=[output, status]
-)
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=[
+        gr.File(label="Upload PDF File", type="filepath"),
+        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
+    ],
+    outputs=[
+        gr.Markdown(label="Markdown Output"),
+        gr.Textbox(label="Processing Status", interactive=False),
+    ],
+    title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
+    allow_flagging="never"
+)
 
 if __name__ == "__main__":
-
-    iface.launch(share=False)
+    iface.launch(server_name="0.0.0.0", server_port=7860)
```
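The commit drops the `status_callback` parameter that was threaded through every helper and instead keeps a one-element `status` list plus an `update_status` closure inside `process_pdf` (a list is used so the closure can mutate the value without `nonlocal`); `process_pdf` now returns a `(markdown, status)` pair that matches the two Gradio outputs. A minimal local smoke test of that return shape, as a sketch only: it assumes this file is saved as `app.py`, that `HF_TOKEN` is exported in the environment, and the PDF URL below is a placeholder rather than anything referenced by the Space.

```python
# Quick local check of the refactored process_pdf, outside the Gradio UI.
# Assumes app.py from this commit is importable and HF_TOKEN is set;
# the URL is only a stand-in for a publicly reachable PDF.
from app import process_pdf

markdown, status = process_pdf(None, "https://example.com/sample.pdf")
print(status)           # final status message, e.g. "Processing complete." or an error
print(markdown[:500])   # start of the generated Markdown (or the error string)
```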
|