broadfield-dev committed on
Commit
a492eda
·
verified ·
1 Parent(s): 9db742a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -56
app.py CHANGED
@@ -35,7 +35,6 @@ def check_poppler():
35
  def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
- # Verify token
39
  if not HF_TOKEN:
40
  raise ValueError("HF_TOKEN is not set")
41
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
@@ -45,9 +44,8 @@ def ensure_hf_dataset():
45
  logger.error(f"Failed to create/access dataset repo: {str(e)}")
46
  return f"Error: Failed to create/access dataset repo: {str(e)}"
47
 
48
- def upload_image_to_hf(image, filename, status_callback):
49
  """Upload an image to Hugging Face dataset and return its URL."""
50
- status_callback("Checking dataset access...")
51
  repo_id = ensure_hf_dataset()
52
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
53
  return repo_id
@@ -58,7 +56,6 @@ def upload_image_to_hf(image, filename, status_callback):
58
  image.save(temp_path, format="PNG")
59
 
60
  # Upload to Hugging Face dataset
61
- status_callback(f"Uploading image {filename}...")
62
  file_url = hf_api.upload_file(
63
  path_or_fileobj=temp_path,
64
  path_in_repo=f"images/{filename}.png",
@@ -73,9 +70,8 @@ def upload_image_to_hf(image, filename, status_callback):
73
  logger.error(f"Error uploading image: {str(e)}")
74
  return f"Error uploading image: {str(e)}"
75
 
76
- def extract_text_from_pdf(pdf_input, status_callback):
77
  """Extract text from PDF using pdfplumber."""
78
- status_callback("Extracting text from PDF...")
79
  try:
80
  if isinstance(pdf_input, str): # URL case
81
  response = requests.get(pdf_input, stream=True)
@@ -96,31 +92,27 @@ def extract_text_from_pdf(pdf_input, status_callback):
96
  logger.error(f"Error extracting text: {str(e)}")
97
  return f"Error extracting text: {str(e)}"
98
 
99
- def extract_images_from_pdf(pdf_input, status_callback):
100
  """Extract images from PDF and convert to PIL images."""
101
- status_callback("Checking poppler-utils...")
102
  if not check_poppler():
103
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
104
 
105
  try:
106
  if isinstance(pdf_input, str): # URL case
107
  logger.info(f"Downloading PDF from URL: {pdf_input}")
108
- status_callback("Downloading PDF for image extraction...")
109
  response = requests.get(pdf_input, stream=True)
110
  response.raise_for_status()
111
  images = convert_from_bytes(response.content)
112
  else: # File upload case
113
  logger.info(f"Processing uploaded PDF: {pdf_input.name}")
114
- status_callback("Extracting images from uploaded PDF...")
115
  images = convert_from_path(pdf_input.name)
116
  return images
117
  except Exception as e:
118
  logger.error(f"Error extracting images: {str(e)}")
119
  return f"Error extracting images: {str(e)}"
120
 
121
- def format_to_markdown(text, images, status_callback):
122
  """Convert extracted text and images to Markdown format."""
123
- status_callback("Formatting output as Markdown...")
124
  markdown_output = "# Extracted PDF Content\n\n"
125
 
126
  # Clean and format text
@@ -143,7 +135,7 @@ def format_to_markdown(text, images, status_callback):
143
  ocr_text = pytesseract.image_to_string(image).strip()
144
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
145
  filename = f"image_{i}_{timestamp}"
146
- image_url = upload_image_to_hf(image, filename, status_callback)
147
 
148
  if not image_url.startswith("Error"):
149
  markdown_output += f"![Image {i+1}]({image_url})\n"
@@ -154,75 +146,72 @@ def format_to_markdown(text, images, status_callback):
154
 
155
  return markdown_output
156
 
157
- def process_pdf(pdf_input, pdf_url, status):
158
  """Main function to process PDF input (file or URL) and generate Markdown."""
 
159
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
160
 
161
- def status_callback(message):
162
- nonlocal status
163
- status = message
164
- return status
165
-
166
- status_callback("Starting PDF processing...")
167
 
168
  if not HF_TOKEN:
169
- status_callback("Error: HF_TOKEN not set.")
170
- return "Error: HF_TOKEN not set in Spaces Secrets.", status
171
 
172
  # Log poppler status
173
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
 
174
 
175
  # Decode URL-encoded string if provided
176
  if pdf_url and pdf_url.strip():
177
  pdf_url = urllib.parse.unquote(pdf_url)
178
  logger.info(f"Decoded URL: {pdf_url}")
179
- status_callback(f"Downloading PDF from URL: {pdf_url}")
180
  try:
181
  response = requests.head(pdf_url, allow_redirects=True)
182
  response.raise_for_status()
183
  pdf_input = pdf_url
184
  except requests.RequestException as e:
185
  logger.error(f"Error accessing URL: {str(e)}")
186
- status_callback(f"Error accessing URL: {str(e)}")
187
- return f"Error accessing URL: {str(e)}", status
188
  elif not pdf_input:
189
- status_callback("Error: No PDF provided.")
190
- return "Error: Please provide a PDF file or URL.", status
191
 
192
- text = extract_text_from_pdf(pdf_input, status_callback)
193
- images = extract_images_from_pdf(pdf_input, status_callback)
 
 
194
 
195
  if isinstance(text, str) and text.startswith("Error"):
196
- status_callback("Text extraction failed.")
197
- return text, status
198
  if isinstance(images, str) and images.startswith("Error"):
199
- status_callback("Image extraction failed.")
200
- return images, status
201
 
202
- markdown_output = format_to_markdown(text, images, status_callback)
203
- status_callback("Processing complete.")
204
- return markdown_output, status
 
205
 
206
  # Gradio Interface
207
- with gr.Blocks() as iface:
208
- gr.Markdown("# PDF to Markdown Converter")
209
- gr.Markdown("Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.")
210
-
211
- with gr.Row():
212
- pdf_input = gr.File(label="Upload PDF File", type="filepath")
213
- pdf_url = gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF")
214
-
215
- status = gr.Textbox(label="Processing Status", interactive=False)
216
- output = gr.Markdown(label="Markdown Output")
217
-
218
- submit_btn = gr.Button("Process PDF")
219
-
220
- submit_btn.click(
221
- fn=process_pdf,
222
- inputs=[pdf_input, pdf_url, status],
223
- outputs=[output, status]
224
- )
225
 
226
  if __name__ == "__main__":
227
- # In Hugging Face Spaces, share=False is sufficient as Spaces handles the server
228
- iface.launch(share=False)
 
35
  def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
 
38
  if not HF_TOKEN:
39
  raise ValueError("HF_TOKEN is not set")
40
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
 
44
  logger.error(f"Failed to create/access dataset repo: {str(e)}")
45
  return f"Error: Failed to create/access dataset repo: {str(e)}"
46
 
47
+ def upload_image_to_hf(image, filename):
48
  """Upload an image to Hugging Face dataset and return its URL."""
 
49
  repo_id = ensure_hf_dataset()
50
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
51
  return repo_id
 
56
  image.save(temp_path, format="PNG")
57
 
58
  # Upload to Hugging Face dataset
 
59
  file_url = hf_api.upload_file(
60
  path_or_fileobj=temp_path,
61
  path_in_repo=f"images/{filename}.png",
 
70
  logger.error(f"Error uploading image: {str(e)}")
71
  return f"Error uploading image: {str(e)}"
72
 
73
+ def extract_text_from_pdf(pdf_input):
74
  """Extract text from PDF using pdfplumber."""
 
75
  try:
76
  if isinstance(pdf_input, str): # URL case
77
  response = requests.get(pdf_input, stream=True)
 
92
  logger.error(f"Error extracting text: {str(e)}")
93
  return f"Error extracting text: {str(e)}"
94
 
95
def extract_images_from_pdf(pdf_input):
    """Convert a PDF into a list of images.

    Args:
        pdf_input: One of
            * an ``http://`` / ``https://`` URL string, which is downloaded;
            * a local file path string (what a Gradio ``File`` component
              declared with ``type="filepath"`` passes to its callback);
            * an object exposing the file path as ``.name``.

    Returns:
        Whatever ``convert_from_bytes`` / ``convert_from_path`` return
        (presumably pdf2image's list of PIL images, one per page - confirm
        against the file's imports), or a string starting with ``"Error"``
        when poppler is missing or the conversion fails.
    """
    if not check_poppler():
        return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."

    try:
        # Only real web URLs are downloaded. Treating every ``str`` as a URL
        # sent plain local paths to ``requests.get``, which rejects them.
        is_url = isinstance(pdf_input, str) and pdf_input.lower().startswith(
            ("http://", "https://")
        )
        if is_url:
            logger.info(f"Downloading PDF from URL: {pdf_input}")
            # The whole body is needed in memory, so streaming buys nothing;
            # the timeout keeps a stalled server from hanging the request.
            response = requests.get(pdf_input, timeout=60)
            response.raise_for_status()
            images = convert_from_bytes(response.content)
        else:
            # Accept both a bare path string and a file-like wrapper.
            pdf_path = pdf_input if isinstance(pdf_input, str) else pdf_input.name
            logger.info(f"Processing uploaded PDF: {pdf_path}")
            images = convert_from_path(pdf_path)
        return images
    except Exception as e:
        # Callers detect failure through the "Error" prefix, so report the
        # problem as a string instead of letting the exception propagate.
        logger.error(f"Error extracting images: {str(e)}")
        return f"Error extracting images: {str(e)}"
113
 
114
+ def format_to_markdown(text, images):
115
  """Convert extracted text and images to Markdown format."""
 
116
  markdown_output = "# Extracted PDF Content\n\n"
117
 
118
  # Clean and format text
 
135
  ocr_text = pytesseract.image_to_string(image).strip()
136
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
137
  filename = f"image_{i}_{timestamp}"
138
+ image_url = upload_image_to_hf(image, filename)
139
 
140
  if not image_url.startswith("Error"):
141
  markdown_output += f"![Image {i+1}]({image_url})\n"
 
146
 
147
  return markdown_output
148
 
149
def process_pdf(pdf_input, pdf_url):
    """Convert a PDF (uploaded file or URL) into Markdown.

    Args:
        pdf_input: The uploaded PDF as handed over by the UI, or a falsy
            value when nothing was uploaded.
        pdf_url: Optional URL of a PDF; may be percent-encoded. When it is
            non-blank it takes precedence over ``pdf_input``.

    Returns:
        A ``(markdown, status)`` tuple. On failure ``markdown`` holds an
        ``"Error..."`` message and ``status`` a short description of the
        step that failed; on success ``status`` is ``"Processing complete."``.
    """
    # Label the timestamp with the zone actually in effect instead of a
    # hard-coded "PDT", which is wrong on any host not running Pacific time.
    logger.info(
        "Starting PDF processing at %s",
        datetime.now().astimezone().strftime("%Y-%m-%d %H:%M:%S %Z"),
    )

    # Only the final status is ever returned (this is not a generator), so
    # each exit returns its status directly rather than via a mutable holder.
    if not HF_TOKEN:
        return "Error: HF_TOKEN not set in Spaces Secrets.", "Error: HF_TOKEN not set."

    # Log poppler status
    logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")

    if pdf_url and pdf_url.strip():
        # Drop surrounding whitespace before decoding so a pasted URL with a
        # trailing newline or space does not reach the HTTP client.
        pdf_url = urllib.parse.unquote(pdf_url.strip())
        logger.info(f"Decoded URL: {pdf_url}")
        try:
            # Cheap reachability probe; the timeout prevents an unresponsive
            # host from blocking the request handler indefinitely.
            response = requests.head(pdf_url, allow_redirects=True, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Error accessing URL: {str(e)}")
            message = f"Error accessing URL: {str(e)}"
            return message, message
        pdf_input = pdf_url
    elif not pdf_input:
        return "Error: Please provide a PDF file or URL.", "Error: No PDF provided."

    # NOTE(review): failure is signalled by an "Error" prefix, so a document
    # whose extracted text itself begins with "Error" is reported as a
    # failure. Fixing that needs a change to the helpers' return contract.
    text = extract_text_from_pdf(pdf_input)
    if isinstance(text, str) and text.startswith("Error"):
        # Bail out before the image conversion: its result would be
        # discarded anyway once text extraction has failed.
        return text, "Text extraction failed."

    images = extract_images_from_pdf(pdf_input)
    if isinstance(images, str) and images.startswith("Error"):
        return images, "Image extraction failed."

    markdown_output = format_to_markdown(text, images)
    return markdown_output, "Processing complete."
199
 
200
  # Gradio Interface
201
# Input widgets, in the order process_pdf receives them: (pdf_input, pdf_url).
_pdf_inputs = [
    gr.File(label="Upload PDF File", type="filepath"),
    gr.Textbox(label="PDF URL", placeholder="Enter the URL of the PDF"),
]

# Output widgets, in the order process_pdf returns them: (markdown, status).
_pdf_outputs = [
    gr.Markdown(label="Markdown Output"),
    gr.Textbox(label="Processing Status", interactive=False),
]

iface = gr.Interface(
    fn=process_pdf,
    inputs=_pdf_inputs,
    outputs=_pdf_outputs,
    title="PDF to Markdown Converter",
    description="Upload a PDF file or provide a PDF URL (including URL-encoded strings with spaces) to convert it into a Markdown document. Images and charts are extracted, uploaded to a Hugging Face dataset, and linked in the Markdown. Formatting (e.g., headings, lists) is preserved. Requires HF_TOKEN in Spaces Secrets.",
    allow_flagging="never",
)


if __name__ == "__main__":
    # Listen on every interface at port 7860 (presumably the port the
    # hosting container exposes - confirm against the deployment config).
    iface.launch(server_name="0.0.0.0", server_port=7860)