broadfield-dev committed on
Commit
9db742a
·
verified ·
1 Parent(s): aec5733

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -17
app.py CHANGED
@@ -35,6 +35,9 @@ def check_poppler():
35
  def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
 
 
 
38
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
39
  logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
40
  return repo_id
@@ -42,8 +45,9 @@ def ensure_hf_dataset():
42
  logger.error(f"Failed to create/access dataset repo: {str(e)}")
43
  return f"Error: Failed to create/access dataset repo: {str(e)}"
44
 
45
- def upload_image_to_hf(image, filename):
46
  """Upload an image to Hugging Face dataset and return its URL."""
 
47
  repo_id = ensure_hf_dataset()
48
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
49
  return repo_id
@@ -54,6 +58,7 @@ def upload_image_to_hf(image, filename):
54
  image.save(temp_path, format="PNG")
55
 
56
  # Upload to Hugging Face dataset
 
57
  file_url = hf_api.upload_file(
58
  path_or_fileobj=temp_path,
59
  path_in_repo=f"images/{filename}.png",
@@ -81,7 +86,7 @@ def extract_text_from_pdf(pdf_input, status_callback):
81
  with pdfplumber.open(pdf_file) as pdf:
82
  text = ""
83
  for page in pdf.pages:
84
- page_text = page.extract_text() or ""
85
  text += page_text + "\n\n"
86
  tables = page.extract_tables()
87
  for table in tables:
@@ -93,18 +98,20 @@ def extract_text_from_pdf(pdf_input, status_callback):
93
 
94
  def extract_images_from_pdf(pdf_input, status_callback):
95
  """Extract images from PDF and convert to PIL images."""
96
- status_callback("Extracting images from PDF...")
97
  if not check_poppler():
98
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
99
 
100
  try:
101
  if isinstance(pdf_input, str): # URL case
102
  logger.info(f"Downloading PDF from URL: {pdf_input}")
 
103
  response = requests.get(pdf_input, stream=True)
104
  response.raise_for_status()
105
  images = convert_from_bytes(response.content)
106
  else: # File upload case
107
  logger.info(f"Processing uploaded PDF: {pdf_input.name}")
 
108
  images = convert_from_path(pdf_input.name)
109
  return images
110
  except Exception as e:
@@ -117,7 +124,7 @@ def format_to_markdown(text, images, status_callback):
117
  markdown_output = "# Extracted PDF Content\n\n"
118
 
119
  # Clean and format text
120
- text = re.sub(r'\n\s*\n', '\n\n', text.strip()) # Remove excessive newlines
121
  lines = text.split("\n")
122
  for line in lines:
123
  # Detect headings (heuristic: all caps or specific keywords)
@@ -133,11 +140,10 @@ def format_to_markdown(text, images, status_callback):
133
  if isinstance(images, list) and images:
134
  markdown_output += "## Extracted Images\n\n"
135
  for i, image in enumerate(images):
136
- status_callback(f"Uploading image {i+1}...")
137
  ocr_text = pytesseract.image_to_string(image).strip()
138
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
139
  filename = f"image_{i}_{timestamp}"
140
- image_url = upload_image_to_hf(image, filename)
141
 
142
  if not image_url.startswith("Error"):
143
  markdown_output += f"![Image {i+1}]({image_url})\n"
@@ -148,14 +154,20 @@ def format_to_markdown(text, images, status_callback):
148
 
149
  return markdown_output
150
 
151
- def process_pdf(pdf_input, pdf_url, status_callback):
152
  """Main function to process PDF input (file or URL) and generate Markdown."""
153
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
 
 
 
 
 
 
154
  status_callback("Starting PDF processing...")
155
 
156
  if not HF_TOKEN:
157
  status_callback("Error: HF_TOKEN not set.")
158
- return "Error: HF_TOKEN not set in Spaces Secrets.", ""
159
 
160
  # Log poppler status
161
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
@@ -172,24 +184,24 @@ def process_pdf(pdf_input, pdf_url, status_callback):
172
  except requests.RequestException as e:
173
  logger.error(f"Error accessing URL: {str(e)}")
174
  status_callback(f"Error accessing URL: {str(e)}")
175
- return f"Error accessing URL: {str(e)}", ""
176
  elif not pdf_input:
177
  status_callback("Error: No PDF provided.")
178
- return "Error: Please provide a PDF file or URL.", ""
179
 
180
  text = extract_text_from_pdf(pdf_input, status_callback)
181
  images = extract_images_from_pdf(pdf_input, status_callback)
182
 
183
  if isinstance(text, str) and text.startswith("Error"):
184
  status_callback("Text extraction failed.")
185
- return text, ""
186
  if isinstance(images, str) and images.startswith("Error"):
187
  status_callback("Image extraction failed.")
188
- return images, ""
189
 
190
  markdown_output = format_to_markdown(text, images, status_callback)
191
  status_callback("Processing complete.")
192
- return markdown_output, ""
193
 
194
  # Gradio Interface
195
  with gr.Blocks() as iface:
@@ -205,12 +217,9 @@ with gr.Blocks() as iface:
205
 
206
  submit_btn = gr.Button("Process PDF")
207
 
208
- def update_status(message):
209
- return message
210
-
211
  submit_btn.click(
212
  fn=process_pdf,
213
- inputs=[pdf_input, pdf_url, update_status],
214
  outputs=[output, status]
215
  )
216
 
 
35
  def ensure_hf_dataset():
36
  """Create or get Hugging Face dataset repository."""
37
  try:
38
+ # Verify token
39
+ if not HF_TOKEN:
40
+ raise ValueError("HF_TOKEN is not set")
41
  repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
42
  logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
43
  return repo_id
 
45
  logger.error(f"Failed to create/access dataset repo: {str(e)}")
46
  return f"Error: Failed to create/access dataset repo: {str(e)}"
47
 
48
+ def upload_image_to_hf(image, filename, status_callback):
49
  """Upload an image to Hugging Face dataset and return its URL."""
50
+ status_callback("Checking dataset access...")
51
  repo_id = ensure_hf_dataset()
52
  if isinstance(repo_id, str) and repo_id.startswith("Error"):
53
  return repo_id
 
58
  image.save(temp_path, format="PNG")
59
 
60
  # Upload to Hugging Face dataset
61
+ status_callback(f"Uploading image {filename}...")
62
  file_url = hf_api.upload_file(
63
  path_or_fileobj=temp_path,
64
  path_in_repo=f"images/{filename}.png",
 
86
  with pdfplumber.open(pdf_file) as pdf:
87
  text = ""
88
  for page in pdf.pages:
89
+ page_text = page.extract_text(layout=True) or ""
90
  text += page_text + "\n\n"
91
  tables = page.extract_tables()
92
  for table in tables:
 
98
 
99
  def extract_images_from_pdf(pdf_input, status_callback):
100
  """Extract images from PDF and convert to PIL images."""
101
+ status_callback("Checking poppler-utils...")
102
  if not check_poppler():
103
  return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
104
 
105
  try:
106
  if isinstance(pdf_input, str): # URL case
107
  logger.info(f"Downloading PDF from URL: {pdf_input}")
108
+ status_callback("Downloading PDF for image extraction...")
109
  response = requests.get(pdf_input, stream=True)
110
  response.raise_for_status()
111
  images = convert_from_bytes(response.content)
112
  else: # File upload case
113
  logger.info(f"Processing uploaded PDF: {pdf_input.name}")
114
+ status_callback("Extracting images from uploaded PDF...")
115
  images = convert_from_path(pdf_input.name)
116
  return images
117
  except Exception as e:
 
124
  markdown_output = "# Extracted PDF Content\n\n"
125
 
126
  # Clean and format text
127
+ text = re.sub(r'\n\s*\n+', '\n\n', text.strip()) # Normalize newlines
128
  lines = text.split("\n")
129
  for line in lines:
130
  # Detect headings (heuristic: all caps or specific keywords)
 
140
  if isinstance(images, list) and images:
141
  markdown_output += "## Extracted Images\n\n"
142
  for i, image in enumerate(images):
 
143
  ocr_text = pytesseract.image_to_string(image).strip()
144
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
145
  filename = f"image_{i}_{timestamp}"
146
+ image_url = upload_image_to_hf(image, filename, status_callback)
147
 
148
  if not image_url.startswith("Error"):
149
  markdown_output += f"![Image {i+1}]({image_url})\n"
 
154
 
155
  return markdown_output
156
 
157
+ def process_pdf(pdf_input, pdf_url, status):
158
  """Main function to process PDF input (file or URL) and generate Markdown."""
159
  logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
160
+
161
+ def status_callback(message):
162
+ nonlocal status
163
+ status = message
164
+ return status
165
+
166
  status_callback("Starting PDF processing...")
167
 
168
  if not HF_TOKEN:
169
  status_callback("Error: HF_TOKEN not set.")
170
+ return "Error: HF_TOKEN not set in Spaces Secrets.", status
171
 
172
  # Log poppler status
173
  logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
 
184
  except requests.RequestException as e:
185
  logger.error(f"Error accessing URL: {str(e)}")
186
  status_callback(f"Error accessing URL: {str(e)}")
187
+ return f"Error accessing URL: {str(e)}", status
188
  elif not pdf_input:
189
  status_callback("Error: No PDF provided.")
190
+ return "Error: Please provide a PDF file or URL.", status
191
 
192
  text = extract_text_from_pdf(pdf_input, status_callback)
193
  images = extract_images_from_pdf(pdf_input, status_callback)
194
 
195
  if isinstance(text, str) and text.startswith("Error"):
196
  status_callback("Text extraction failed.")
197
+ return text, status
198
  if isinstance(images, str) and images.startswith("Error"):
199
  status_callback("Image extraction failed.")
200
+ return images, status
201
 
202
  markdown_output = format_to_markdown(text, images, status_callback)
203
  status_callback("Processing complete.")
204
+ return markdown_output, status
205
 
206
  # Gradio Interface
207
  with gr.Blocks() as iface:
 
217
 
218
  submit_btn = gr.Button("Process PDF")
219
 
 
 
 
220
  submit_btn.click(
221
  fn=process_pdf,
222
+ inputs=[pdf_input, pdf_url, status],
223
  outputs=[output, status]
224
  )
225