Spaces:

broadfield-dev
/

pdf2markdown

Sleeping

App Files Files Community

broadfield-dev committed on Jun 2

Commit

9db742a

verified ·

1 Parent(s): aec5733

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -17

app.py CHANGED Viewed

@@ -35,6 +35,9 @@ def check_poppler():
 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
         return repo_id
@@ -42,8 +45,9 @@ def ensure_hf_dataset():
         logger.error(f"Failed to create/access dataset repo: {str(e)}")
         return f"Error: Failed to create/access dataset repo: {str(e)}"
-def upload_image_to_hf(image, filename):
     """Upload an image to Hugging Face dataset and return its URL."""
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
@@ -54,6 +58,7 @@ def upload_image_to_hf(image, filename):
         image.save(temp_path, format="PNG")
         # Upload to Hugging Face dataset
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
@@ -81,7 +86,7 @@ def extract_text_from_pdf(pdf_input, status_callback):
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
             for page in pdf.pages:
-                page_text = page.extract_text() or ""
                 text += page_text + "\n\n"
                 tables = page.extract_tables()
                 for table in tables:
@@ -93,18 +98,20 @@ def extract_text_from_pdf(pdf_input, status_callback):
 def extract_images_from_pdf(pdf_input, status_callback):
     """Extract images from PDF and convert to PIL images."""
-    status_callback("Extracting images from PDF...")
     if not check_poppler():
         return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
             logger.info(f"Processing uploaded PDF: {pdf_input.name}")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
@@ -117,7 +124,7 @@ def format_to_markdown(text, images, status_callback):
     markdown_output = "# Extracted PDF Content\n\n"
     # Clean and format text
-    text = re.sub(r'\n\s*\n', '\n\n', text.strip())  # Remove excessive newlines
     lines = text.split("\n")
     for line in lines:
         # Detect headings (heuristic: all caps or specific keywords)
@@ -133,11 +140,10 @@ def format_to_markdown(text, images, status_callback):
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
-            status_callback(f"Uploading image {i+1}...")
             ocr_text = pytesseract.image_to_string(image).strip()
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
-            image_url = upload_image_to_hf(image, filename)
             if not image_url.startswith("Error"):
                 markdown_output += f"![Image {i+1}]({image_url})\n"
@@ -148,14 +154,20 @@ def format_to_markdown(text, images, status_callback):
     return markdown_output
-def process_pdf(pdf_input, pdf_url, status_callback):
     """Main function to process PDF input (file or URL) and generate Markdown."""
     logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
     status_callback("Starting PDF processing...")
     if not HF_TOKEN:
         status_callback("Error: HF_TOKEN not set.")
-        return "Error: HF_TOKEN not set in Spaces Secrets.", ""
     # Log poppler status
     logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
@@ -172,24 +184,24 @@ def process_pdf(pdf_input, pdf_url, status_callback):
         except requests.RequestException as e:
             logger.error(f"Error accessing URL: {str(e)}")
             status_callback(f"Error accessing URL: {str(e)}")
-            return f"Error accessing URL: {str(e)}", ""
     elif not pdf_input:
         status_callback("Error: No PDF provided.")
-        return "Error: Please provide a PDF file or URL.", ""
     text = extract_text_from_pdf(pdf_input, status_callback)
     images = extract_images_from_pdf(pdf_input, status_callback)
     if isinstance(text, str) and text.startswith("Error"):
         status_callback("Text extraction failed.")
-        return text, ""
     if isinstance(images, str) and images.startswith("Error"):
         status_callback("Image extraction failed.")
-        return images, ""
     markdown_output = format_to_markdown(text, images, status_callback)
     status_callback("Processing complete.")
-    return markdown_output, ""
 # Gradio Interface
 with gr.Blocks() as iface:
@@ -205,12 +217,9 @@ with gr.Blocks() as iface:
     submit_btn = gr.Button("Process PDF")
-    def update_status(message):
-        return message
     submit_btn.click(
         fn=process_pdf,
-        inputs=[pdf_input, pdf_url, update_status],
         outputs=[output, status]
     )

 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
+        # Verify token
+        if not HF_TOKEN:
+            raise ValueError("HF_TOKEN is not set")
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
         logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
         return repo_id
         logger.error(f"Failed to create/access dataset repo: {str(e)}")
         return f"Error: Failed to create/access dataset repo: {str(e)}"
+def upload_image_to_hf(image, filename, status_callback):
     """Upload an image to Hugging Face dataset and return its URL."""
+    status_callback("Checking dataset access...")
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
         image.save(temp_path, format="PNG")
         # Upload to Hugging Face dataset
+        status_callback(f"Uploading image {filename}...")
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
         with pdfplumber.open(pdf_file) as pdf:
             text = ""
             for page in pdf.pages:
+                page_text = page.extract_text(layout=True) or ""
                 text += page_text + "\n\n"
                 tables = page.extract_tables()
                 for table in tables:
 def extract_images_from_pdf(pdf_input, status_callback):
     """Extract images from PDF and convert to PIL images."""
+    status_callback("Checking poppler-utils...")
     if not check_poppler():
         return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
+            status_callback("Downloading PDF for image extraction...")
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
             logger.info(f"Processing uploaded PDF: {pdf_input.name}")
+            status_callback("Extracting images from uploaded PDF...")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
     markdown_output = "# Extracted PDF Content\n\n"
     # Clean and format text
+    text = re.sub(r'\n\s*\n+', '\n\n', text.strip())  # Normalize newlines
     lines = text.split("\n")
     for line in lines:
         # Detect headings (heuristic: all caps or specific keywords)
     if isinstance(images, list) and images:
         markdown_output += "## Extracted Images\n\n"
         for i, image in enumerate(images):
             ocr_text = pytesseract.image_to_string(image).strip()
             timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             filename = f"image_{i}_{timestamp}"
+            image_url = upload_image_to_hf(image, filename, status_callback)
             if not image_url.startswith("Error"):
                 markdown_output += f"![Image {i+1}]({image_url})\n"
     return markdown_output
+def process_pdf(pdf_input, pdf_url, status):
     """Main function to process PDF input (file or URL) and generate Markdown."""
     logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
+    def status_callback(message):
+        nonlocal status
+        status = message
+        return status
     status_callback("Starting PDF processing...")
     if not HF_TOKEN:
         status_callback("Error: HF_TOKEN not set.")
+        return "Error: HF_TOKEN not set in Spaces Secrets.", status
     # Log poppler status
     logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
         except requests.RequestException as e:
             logger.error(f"Error accessing URL: {str(e)}")
             status_callback(f"Error accessing URL: {str(e)}")
+            return f"Error accessing URL: {str(e)}", status
     elif not pdf_input:
         status_callback("Error: No PDF provided.")
+        return "Error: Please provide a PDF file or URL.", status
     text = extract_text_from_pdf(pdf_input, status_callback)
     images = extract_images_from_pdf(pdf_input, status_callback)
     if isinstance(text, str) and text.startswith("Error"):
         status_callback("Text extraction failed.")
+        return text, status
     if isinstance(images, str) and images.startswith("Error"):
         status_callback("Image extraction failed.")
+        return images, status
     markdown_output = format_to_markdown(text, images, status_callback)
     status_callback("Processing complete.")
+    return markdown_output, status
 # Gradio Interface
 with gr.Blocks() as iface:
     submit_btn = gr.Button("Process PDF")
     submit_btn.click(
         fn=process_pdf,
+        inputs=[pdf_input, pdf_url, status],
         outputs=[output, status]
     )