# NOTE: The following header is residue from the Hugging Face Spaces file
# viewer this source was copied from; it is not part of the program.
#   Space status: Sleeping
#   File size: 5,311 Bytes
#   Revisions shown in the blame gutter: f86ad35, 77541b8
#   (The viewer's line-number gutter, 1-147, followed here.)
import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
from datetime import datetime
# Initialize Hugging Face API
# HF_TOKEN is read from the environment; os.getenv returns None when it is
# unset, which process_pdf() reports as an error before doing any work.
HF_TOKEN = os.getenv("HF_TOKEN") # Set in Hugging Face Spaces Secrets
# Repository name passed to create_repo() by ensure_hf_repo().
REPO_NAME = "pdf-images-extracted" # Hugging Face repo for images
# Shared Hub client used by upload_image_to_hf() for upload_file() calls.
hf_api = HfApi()
def ensure_hf_repo():
    """Create the image repository on the Hub if needed and return its id.

    Returns:
        The repository id (``namespace/name``) on success, or a string
        starting with ``"Error creating repo:"`` if the Hub call failed.
        Callers in this file detect failure by checking for that prefix.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:
        return f"Error creating repo: {str(e)}"
    # create_repo() returns a RepoUrl whose string value is the full
    # https://... URL; upload APIs expect the "namespace/name" id, which is
    # exposed as `.repo_id`. Fall back to the raw value for library versions
    # that return a plain string.
    return getattr(repo_url, "repo_id", repo_url)
def upload_image_to_hf(image, filename):
    """Upload an image to Hugging Face Hub and return its URL.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.
        filename: Base name (without extension) used for the file stored
            under ``images/`` in the repository.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success, or a string
        starting with ``"Error"`` if the repository could not be prepared or
        the upload failed.
    """
    repo_id = ensure_hf_repo()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id
    try:
        # Serialize in memory instead of writing to /tmp: nothing is left
        # behind when the upload raises, and concurrent requests cannot
        # collide on a shared temp-file name.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        return hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            token=HF_TOKEN,
        )
    except Exception as e:
        return f"Error uploading image: {str(e)}"
def extract_text_from_pdf(pdf_file):
    """Extract text from PDF using PyPDF2.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or a file-like
            object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The text of every page, each followed by a blank line, or a string
        starting with ``"Error extracting text:"`` on failure.
    """
    try:
        source = pdf_file
        if isinstance(pdf_file, str) and pdf_file.lower().startswith(("http://", "https://")):
            # PdfReader would treat a URL string as a local path, so fetch
            # the document and hand it over as an in-memory stream.
            response = requests.get(pdf_file, timeout=30)
            response.raise_for_status()
            source = io.BytesIO(response.content)
        reader = PyPDF2.PdfReader(source)
        # extract_text() may return None/empty for pages without a text layer.
        return "".join((page.extract_text() or "") + "\n\n" for page in reader.pages)
    except Exception as e:
        return f"Error extracting text: {str(e)}"
def extract_images_from_pdf(pdf_file):
    """Convert a PDF to a list of PIL images via pdf2image.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or an uploaded
            file object exposing the path as ``.name``.

    Returns:
        The list returned by pdf2image's ``convert_from_bytes`` /
        ``convert_from_path``, or a string starting with
        ``"Error extracting images:"`` on failure.
    """
    try:
        if isinstance(pdf_file, str):
            # A string is either a URL or a local path (the UI's file input
            # is configured with type="filepath"); only URLs are downloaded.
            if pdf_file.lower().startswith(("http://", "https://")):
                response = requests.get(pdf_file, timeout=30)
                response.raise_for_status()
                return convert_from_bytes(response.content)
            return convert_from_path(pdf_file)
        # File-object upload case: the path is carried on `.name`.
        return convert_from_path(pdf_file.name)
    except Exception as e:
        return f"Error extracting images: {str(e)}"
def format_to_markdown(text, images):
    """Convert extracted text and images to Markdown format.

    Text handling (line by line, blank lines skipped):
      * a line that is all upper case and longer than 5 characters becomes a
        ``##`` heading;
      * a line starting with ``N.``/``N)`` or ``-``/``*``/``+`` followed by
        whitespace becomes a ``-`` list item with the marker removed;
      * anything else becomes its own paragraph.

    Image handling: each image is OCR'd with pytesseract, uploaded through
    ``upload_image_to_hf`` and embedded as a Markdown image; the OCR text, if
    any, follows in a fenced block. A failed upload is reported inline.

    Args:
        text: Extracted document text.
        images: List of PIL images; any other value (or an empty list) means
            no image section is produced.

    Returns:
        The Markdown document as a single string.
    """
    # One or more digits followed by "." or ")", or a single bullet character,
    # then whitespace. The whitespace requirement keeps "-5 degrees" or
    # "2024 was..." from being mistaken for list items.
    item_prefix = re.compile(r'^\s*(?:\d+[.)]|[-*+])\s+')

    parts = ["# Extracted PDF Content\n\n"]
    # Clean and format text
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())  # Remove excessive newlines
    in_list = False
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            # Paragraph spacing is emitted explicitly below.
            continue
        # Detect headings (simple heuristic: all caps)
        is_heading = stripped.isupper() and len(stripped) > 5
        # Detect lists (lines starting with numbers or bullets)
        is_item = not is_heading and item_prefix.match(stripped) is not None
        if in_list and not is_item:
            # Blank line so the following block is not absorbed into the list.
            parts.append("\n")
        in_list = is_item
        if is_heading:
            parts.append(f"## {stripped}\n\n")
        elif is_item:
            parts.append(f"- {item_prefix.sub('', stripped, count=1)}\n")
        else:
            parts.append(f"{stripped}\n\n")
    if in_list:
        parts.append("\n")

    # Add images with Hugging Face URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"image_{i}_{timestamp}"
            image_url = upload_image_to_hf(image, filename)
            if str(image_url).startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue
            # NOTE(review): upload_file is expected to return a ".../blob/..."
            # page URL; the raw file lives under ".../resolve/...". Confirm
            # against the installed huggingface_hub version.
            raw_url = str(image_url).replace("/blob/", "/resolve/", 1)
            parts.append(f"![Image {i+1}]({raw_url})\n\n")
            if ocr_text:
                parts.append(f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n")
    return "".join(parts)
def process_pdf(pdf_input, pdf_url):
    """Main function to process PDF input (file or URL) and generate Markdown.

    Args:
        pdf_input: Uploaded PDF as provided by the Gradio file component, or
            a falsy value when nothing was uploaded.
        pdf_url: URL of a PDF; takes precedence over ``pdf_input`` when it is
            non-blank.

    Returns:
        The generated Markdown, or a string starting with ``"Error"``
        describing why processing could not be completed.
    """
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets."

    url = (pdf_url or "").strip()
    if url:
        try:
            # requests.head does not follow redirects by default, so a PDF
            # behind a redirect would otherwise be rejected; the timeout
            # keeps an unresponsive host from hanging the request.
            response = requests.head(url, allow_redirects=True, timeout=15)
        except requests.RequestException:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        if response.status_code != 200:
            return f"Error: Invalid URL or inaccessible PDF: {url}"
        pdf_file = url
    elif pdf_input:
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    text = extract_text_from_pdf(pdf_file)
    # Match the extractor's own prefix so a document whose text merely begins
    # with the word "Error" is not mistaken for a failure.
    if isinstance(text, str) and text.startswith("Error extracting text:"):
        return text
    # Only render pages once text extraction is known to have succeeded.
    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str) and images.startswith("Error"):
        return images
    return format_to_markdown(text, images)
# Gradio Interface
# The two inputs are passed positionally to process_pdf(pdf_input, pdf_url);
# the returned string is rendered by the Markdown output component.
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
    ],
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()