File size: 5,311 Bytes
f86ad35
 
 
 
 
 
 
 
 
 
 
 
 
77541b8
 
f86ad35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77541b8
f86ad35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77541b8
 
 
f86ad35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77541b8
f86ad35
 
 
77541b8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import gradio as gr
import requests
import PyPDF2
from pdf2image import convert_from_path, convert_from_bytes
import pytesseract
from PIL import Image
import io
import os
from huggingface_hub import HfApi, create_repo
import re
from datetime import datetime

# Initialize Hugging Face API
# HF_TOKEN is read from the environment and is None when the variable is
# unset; process_pdf refuses to run in that case.
HF_TOKEN = os.getenv("HF_TOKEN")  # Set in Hugging Face Spaces Secrets
REPO_NAME = "pdf-images-extracted"  # Hugging Face repo for images
# Shared Hub client; upload_image_to_hf calls its upload_file method.
hf_api = HfApi()

def ensure_hf_repo():
    """Create the image repository on the Hub if needed and return its id.

    Returns:
        The repository id to pass to upload calls on success, or a string
        starting with ``"Error"`` that describes the failure. Callers detect
        failure by testing for that prefix, so this function never raises.
    """
    try:
        repo_url = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, exist_ok=True)
    except Exception as e:
        # Boundary handler: failures are reported as strings, not exceptions.
        return f"Error creating repo: {str(e)}"

    # Recent huggingface_hub releases return a RepoUrl: a str whose value is
    # the full https URL, with the canonical "namespace/name" id available as
    # `.repo_id`. Upload APIs expect the id, not the URL, so prefer it when
    # present; a plain str result is passed through unchanged.
    return getattr(repo_url, "repo_id", repo_url)

def upload_image_to_hf(image, filename):
    """Upload an image to the Hugging Face Hub as ``images/<filename>.png``.

    Args:
        image: Object with a PIL-style ``save(fp, format=...)`` method.
        filename: Base name (without extension) used for the file in the repo.

    Returns:
        Whatever ``hf_api.upload_file`` returns on success (a URL string), or
        a string starting with ``"Error"`` on failure. Never raises.
    """
    repo_id = ensure_hf_repo()
    if isinstance(repo_id, str) and repo_id.startswith("Error"):
        return repo_id

    try:
        # Encode the PNG in memory instead of writing /tmp/temp_<filename>.png:
        # nothing is left behind if the upload fails, concurrent requests
        # cannot clobber each other's file, and `filename` never reaches the
        # local filesystem.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")

        # NOTE(review): depending on the huggingface_hub version the returned
        # URL may point at the commit or blob page rather than the raw file;
        # confirm it renders when embedded as a Markdown image.
        file_url = hf_api.upload_file(
            path_or_fileobj=buffer.getvalue(),
            path_in_repo=f"images/{filename}.png",
            repo_id=repo_id,
            token=HF_TOKEN,
        )
        return file_url
    except Exception as e:
        # Boundary handler: callers check for the "Error" prefix.
        return f"Error uploading image: {str(e)}"

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path, or a binary
            file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        The page texts, each followed by a blank line, or a string starting
        with ``"Error"`` on failure. Never raises.
    """
    try:
        if isinstance(pdf_file, str) and pdf_file.lower().startswith(
            ("http://", "https://")
        ):
            # PdfReader would treat a URL string as a local path, so fetch
            # the document first and read it from memory.
            response = requests.get(pdf_file, timeout=30)
            response.raise_for_status()
            source = io.BytesIO(response.content)
        else:
            source = pdf_file

        reader = PyPDF2.PdfReader(source)
        # extract_text() may yield None for pages without a text layer.
        return "".join(
            (page.extract_text() or "") + "\n\n" for page in reader.pages
        )
    except Exception as e:
        # Boundary handler: callers check for the "Error" prefix.
        return f"Error extracting text: {str(e)}"

def extract_images_from_pdf(pdf_file):
    """Render a PDF to a list of PIL images via pdf2image.

    Args:
        pdf_file: An ``http(s)://`` URL, a local file path string, or an
            object exposing the path as ``.name``.

    Returns:
        The list produced by pdf2image, or a string starting with ``"Error"``
        on failure. Never raises.
    """
    try:
        if isinstance(pdf_file, str):
            if pdf_file.lower().startswith(("http://", "https://")):
                # Remote document: download it and convert from memory.
                response = requests.get(pdf_file, timeout=30)
                response.raise_for_status()
                return convert_from_bytes(response.content)
            # A plain string that is not a URL is a local path, not something
            # to fetch over HTTP.
            return convert_from_path(pdf_file)
        # Upload object case: the path is carried on `.name`.
        return convert_from_path(pdf_file.name)
    except Exception as e:
        # Boundary handler: callers check for the "Error" prefix.
        return f"Error extracting images: {str(e)}"

# A list item: a bullet ("-", "*", "+", "•") or a numbered marker ("1.", "12)")
# followed by whitespace. Group 1 captures the item text.
_LIST_ITEM_RE = re.compile(r'^(?:[-*+\u2022]|\d+[.)])\s+(.*)$')


def format_to_markdown(text, images):
    """Convert extracted text and page images to a Markdown document.

    Args:
        text: Raw text extracted from the PDF.
        images: List of PIL images to OCR, upload and link. Any other value
            (for example an error string) or an empty list produces no
            image section.

    Returns:
        The Markdown document as a single string.
    """
    parts = ["# Extracted PDF Content\n\n"]

    # Collapse runs of blank lines, then classify each remaining line.
    text = re.sub(r'\n\s*\n', '\n\n', text.strip())
    in_list = False
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped:
            # Every emitted block already ends with its own blank line.
            continue

        # Heading heuristic takes priority (so "1. INTRODUCTION" is a heading).
        is_heading = stripped.isupper() and len(stripped) > 5
        item = None if is_heading else _LIST_ITEM_RE.match(stripped)

        if item:
            parts.append(f"- {item.group(1).strip()}\n")
            in_list = True
            continue

        if in_list:
            # Close the list so the next block is not absorbed into the last
            # item as a lazy continuation line.
            parts.append("\n")
            in_list = False

        if is_heading:
            parts.append(f"## {stripped}\n\n")
        else:
            parts.append(f"{stripped}\n\n")

    if in_list:
        parts.append("\n")

    # Add images with Hugging Face URLs
    if isinstance(images, list) and images:
        parts.append("## Extracted Images\n\n")
        # One timestamp per document; the index keeps file names unique.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for i, image in enumerate(images):
            ocr_text = pytesseract.image_to_string(image).strip()
            filename = f"image_{i}_{timestamp}"
            # str() guards against a non-str URL object from the uploader.
            image_url = str(upload_image_to_hf(image, filename))

            if image_url.startswith("Error"):
                parts.append(f"**Image {i+1} Error:** {image_url}\n\n")
                continue

            parts.append(f"![Image {i+1}]({image_url})\n")
            if ocr_text:
                parts.append(
                    f"**Image {i+1} OCR Text:**\n```\n{ocr_text}\n```\n\n"
                )
            else:
                # Keep consecutive images in separate paragraphs.
                parts.append("\n")

    return "".join(parts)

def process_pdf(pdf_input, pdf_url):
    """Process a PDF given as an upload or a URL and return Markdown.

    Args:
        pdf_input: Uploaded file value from the UI, or a falsy value if none.
        pdf_url: URL of a PDF; takes precedence over ``pdf_input`` when it is
            non-blank.

    Returns:
        The generated Markdown, or a string starting with ``"Error"`` that
        explains what went wrong.
    """
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets."

    pdf_url = pdf_url.strip() if pdf_url else ""
    if pdf_url:
        try:
            # Follow redirects (HEAD does not by default) and bound the wait
            # so an unresponsive host cannot hang the request.
            response = requests.head(pdf_url, allow_redirects=True, timeout=15)
        except requests.RequestException as e:
            # Malformed URLs, DNS failures and timeouts become a normal error
            # message instead of an unhandled exception.
            return f"Error: Invalid URL or inaccessible PDF: {pdf_url} ({e})"
        if response.status_code != 200:
            return f"Error: Invalid URL or inaccessible PDF: {pdf_url}"
        pdf_file = pdf_url
    elif pdf_input:
        pdf_file = pdf_input
    else:
        return "Error: Please provide a PDF file or URL."

    # NOTE(review): the extractors signal failure with an "Error" prefix, so
    # a document whose text begins with "Error" is reported as a failure.
    text = extract_text_from_pdf(pdf_file)
    if isinstance(text, str) and text.startswith("Error"):
        return text

    # Render pages only after text extraction succeeded, to avoid wasted work.
    images = extract_images_from_pdf(pdf_file)
    if isinstance(images, str) and images.startswith("Error"):
        return images

    return format_to_markdown(text, images)

# Gradio Interface
# Wires process_pdf to a two-input form. The inputs are listed in the same
# order as process_pdf's parameters (pdf_input, pdf_url).
iface = gr.Interface(
    fn=process_pdf,
    inputs=[
        # NOTE(review): type="filepath" presumably makes Gradio pass the upload
        # as a path string rather than a file object; confirm against the
        # installed Gradio version.
        gr.File(label="Upload PDF File", type="filepath"),
        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
    ],
    # process_pdf returns a Markdown string (or an "Error..." message).
    outputs=gr.Markdown(label="Markdown Output"),
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL to convert it into a Markdown document. Images and charts are extracted, uploaded to Hugging Face Hub, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved as much as possible. Requires HF_TOKEN in Spaces Secrets.",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()