Update app.py
app.py CHANGED
```diff
@@ -35,7 +35,6 @@ def check_poppler():
 def ensure_hf_dataset():
     """Create or get Hugging Face dataset repository."""
     try:
-        # Verify token
         if not HF_TOKEN:
             raise ValueError("HF_TOKEN is not set")
         repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
@@ -45,9 +44,8 @@ def ensure_hf_dataset():
         logger.error(f"Failed to create/access dataset repo: {str(e)}")
         return f"Error: Failed to create/access dataset repo: {str(e)}"
 
-def upload_image_to_hf(image, filename, status_callback):
+def upload_image_to_hf(image, filename):
     """Upload an image to Hugging Face dataset and return its URL."""
-    status_callback("Checking dataset access...")
     repo_id = ensure_hf_dataset()
     if isinstance(repo_id, str) and repo_id.startswith("Error"):
         return repo_id
@@ -58,7 +56,6 @@ def upload_image_to_hf(image, filename, status_callback):
         image.save(temp_path, format="PNG")
 
         # Upload to Hugging Face dataset
-        status_callback(f"Uploading image {filename}...")
         file_url = hf_api.upload_file(
             path_or_fileobj=temp_path,
             path_in_repo=f"images/{filename}.png",
@@ -73,9 +70,8 @@ def upload_image_to_hf(image, filename, status_callback):
         logger.error(f"Error uploading image: {str(e)}")
         return f"Error uploading image: {str(e)}"
 
-def extract_text_from_pdf(pdf_input, status_callback):
+def extract_text_from_pdf(pdf_input):
     """Extract text from PDF using pdfplumber."""
-    status_callback("Extracting text from PDF...")
     try:
         if isinstance(pdf_input, str):  # URL case
             response = requests.get(pdf_input, stream=True)
@@ -96,31 +92,27 @@ def extract_text_from_pdf(pdf_input, status_callback):
         logger.error(f"Error extracting text: {str(e)}")
         return f"Error extracting text: {str(e)}"
 
-def extract_images_from_pdf(pdf_input, status_callback):
+def extract_images_from_pdf(pdf_input):
     """Extract images from PDF and convert to PIL images."""
-    status_callback("Checking poppler-utils...")
     if not check_poppler():
         return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
 
     try:
         if isinstance(pdf_input, str):  # URL case
             logger.info(f"Downloading PDF from URL: {pdf_input}")
-            status_callback("Downloading PDF for image extraction...")
             response = requests.get(pdf_input, stream=True)
             response.raise_for_status()
             images = convert_from_bytes(response.content)
         else:  # File upload case
             logger.info(f"Processing uploaded PDF: {pdf_input.name}")
-            status_callback("Extracting images from uploaded PDF...")
             images = convert_from_path(pdf_input.name)
         return images
     except Exception as e:
         logger.error(f"Error extracting images: {str(e)}")
         return f"Error extracting images: {str(e)}"
 
-def format_to_markdown(text, images, status_callback):
+def format_to_markdown(text, images):
     """Convert extracted text and images to Markdown format."""
-    status_callback("Formatting output as Markdown...")
     markdown_output = "# Extracted PDF Content\n\n"
 
     # Clean and format text
@@ -143,7 +135,7 @@ def format_to_markdown(text, images, status_callback):
         ocr_text = pytesseract.image_to_string(image).strip()
         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         filename = f"image_{i}_{timestamp}"
-        image_url = upload_image_to_hf(image, filename, status_callback)
+        image_url = upload_image_to_hf(image, filename)
 
         if not image_url.startswith("Error"):
             markdown_output += f"\n"
@@ -154,75 +146,72 @@ def format_to_markdown(text, images, status_callback):
 
     return markdown_output
 
-def process_pdf(pdf_input, pdf_url
+def process_pdf(pdf_input, pdf_url):
     """Main function to process PDF input (file or URL) and generate Markdown."""
+    status = ["Starting PDF processing..."]
     logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
 
-    def
-
-        status
-        return status
-
-    status_callback("Starting PDF processing...")
+    def update_status(message):
+        status[0] = message
+        return status[0]
 
     if not HF_TOKEN:
-
-        return "Error: HF_TOKEN not set in Spaces Secrets.", status
+        update_status("Error: HF_TOKEN not set.")
+        return "Error: HF_TOKEN not set in Spaces Secrets.", status[0]
 
     # Log poppler status
     logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
+    update_status("Checking poppler-utils...")
 
     # Decode URL-encoded string if provided
     if pdf_url and pdf_url.strip():
         pdf_url = urllib.parse.unquote(pdf_url)
         logger.info(f"Decoded URL: {pdf_url}")
-
+        update_status(f"Downloading PDF from URL: {pdf_url}")
         try:
             response = requests.head(pdf_url, allow_redirects=True)
             response.raise_for_status()
             pdf_input = pdf_url
         except requests.RequestException as e:
             logger.error(f"Error accessing URL: {str(e)}")
-
-            return f"Error accessing URL: {str(e)}", status
+            update_status(f"Error accessing URL: {str(e)}")
+            return f"Error accessing URL: {str(e)}", status[0]
     elif not pdf_input:
-
-        return "Error: Please provide a PDF file or URL.", status
+        update_status("Error: No PDF provided.")
+        return "Error: Please provide a PDF file or URL.", status[0]
 
-    text
-
+    update_status("Extracting text from PDF...")
+    text = extract_text_from_pdf(pdf_input)
+    update_status("Extracting images from PDF...")
+    images = extract_images_from_pdf(pdf_input)
 
     if isinstance(text, str) and text.startswith("Error"):
-
-        return text, status
+        update_status("Text extraction failed.")
+        return text, status[0]
     if isinstance(images, str) and images.startswith("Error"):
-
-        return images, status
+        update_status("Image extraction failed.")
+        return images, status[0]
 
-
-
-
+    update_status("Formatting output as Markdown...")
+    markdown_output = format_to_markdown(text, images)
+    update_status("Processing complete.")
+    return markdown_output, status[0]
 
 # Gradio Interface
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    fn=process_pdf,
-    inputs=[pdf_input, pdf_url, status],
-    outputs=[output, status]
-)
+iface = gr.Interface(
+    fn=process_pdf,
+    inputs=[
+        gr.File(label="Upload PDF File", type="filepath"),
+        gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
+    ],
+    outputs=[
+        gr.Markdown(label="Markdown Output"),
+        gr.Textbox(label="Processing Status", interactive=False),
+    ],
+    title="PDF to Markdown Converter",
+    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
+    allow_flagging="never"
+)
 
 if __name__ == "__main__":
-
-    iface.launch(share=False)
+    iface.launch(server_name="0.0.0.0", server_port=7860)
```
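The commit drops the `status_callback` parameter that was threaded through every helper and instead keeps a one-element `status` list plus an `update_status` closure inside `process_pdf` (a list is used so the closure can mutate the value without `nonlocal`); `process_pdf` now returns a `(markdown, status)` pair that matches the two Gradio outputs. A minimal local smoke test of that return shape, as a sketch only: it assumes this file is saved as `app.py`, that `HF_TOKEN` is exported in the environment, and the PDF URL below is a placeholder rather than anything referenced by the Space.

```python
# Quick local check of the refactored process_pdf, outside the Gradio UI.
# Assumes app.py from this commit is importable and HF_TOKEN is set;
# the URL is only a stand-in for a publicly reachable PDF.
from app import process_pdf

markdown, status = process_pdf(None, "https://example.com/sample.pdf")
print(status)           # final status message, e.g. "Processing complete." or an error
print(markdown[:500])   # start of the generated Markdown (or the error string)
```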
|