Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -35,6 +35,9 @@ def check_poppler():
|
|
35 |
def ensure_hf_dataset():
|
36 |
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
|
|
|
|
|
|
38 |
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
39 |
logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
|
40 |
return repo_id
|
@@ -42,8 +45,9 @@ def ensure_hf_dataset():
|
|
42 |
logger.error(f"Failed to create/access dataset repo: {str(e)}")
|
43 |
return f"Error: Failed to create/access dataset repo: {str(e)}"
|
44 |
|
45 |
-
def upload_image_to_hf(image, filename):
|
46 |
"""Upload an image to Hugging Face dataset and return its URL."""
|
|
|
47 |
repo_id = ensure_hf_dataset()
|
48 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
49 |
return repo_id
|
@@ -54,6 +58,7 @@ def upload_image_to_hf(image, filename):
|
|
54 |
image.save(temp_path, format="PNG")
|
55 |
|
56 |
# Upload to Hugging Face dataset
|
|
|
57 |
file_url = hf_api.upload_file(
|
58 |
path_or_fileobj=temp_path,
|
59 |
path_in_repo=f"images/{filename}.png",
|
@@ -81,7 +86,7 @@ def extract_text_from_pdf(pdf_input, status_callback):
|
|
81 |
with pdfplumber.open(pdf_file) as pdf:
|
82 |
text = ""
|
83 |
for page in pdf.pages:
|
84 |
-
page_text = page.extract_text() or ""
|
85 |
text += page_text + "\n\n"
|
86 |
tables = page.extract_tables()
|
87 |
for table in tables:
|
@@ -93,18 +98,20 @@ def extract_text_from_pdf(pdf_input, status_callback):
|
|
93 |
|
94 |
def extract_images_from_pdf(pdf_input, status_callback):
|
95 |
"""Extract images from PDF and convert to PIL images."""
|
96 |
-
status_callback("
|
97 |
if not check_poppler():
|
98 |
return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
|
99 |
|
100 |
try:
|
101 |
if isinstance(pdf_input, str): # URL case
|
102 |
logger.info(f"Downloading PDF from URL: {pdf_input}")
|
|
|
103 |
response = requests.get(pdf_input, stream=True)
|
104 |
response.raise_for_status()
|
105 |
images = convert_from_bytes(response.content)
|
106 |
else: # File upload case
|
107 |
logger.info(f"Processing uploaded PDF: {pdf_input.name}")
|
|
|
108 |
images = convert_from_path(pdf_input.name)
|
109 |
return images
|
110 |
except Exception as e:
|
@@ -117,7 +124,7 @@ def format_to_markdown(text, images, status_callback):
|
|
117 |
markdown_output = "# Extracted PDF Content\n\n"
|
118 |
|
119 |
# Clean and format text
|
120 |
-
text = re.sub(r'\n\s*\n', '\n\n', text.strip()) #
|
121 |
lines = text.split("\n")
|
122 |
for line in lines:
|
123 |
# Detect headings (heuristic: all caps or specific keywords)
|
@@ -133,11 +140,10 @@ def format_to_markdown(text, images, status_callback):
|
|
133 |
if isinstance(images, list) and images:
|
134 |
markdown_output += "## Extracted Images\n\n"
|
135 |
for i, image in enumerate(images):
|
136 |
-
status_callback(f"Uploading image {i+1}...")
|
137 |
ocr_text = pytesseract.image_to_string(image).strip()
|
138 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
139 |
filename = f"image_{i}_{timestamp}"
|
140 |
-
image_url = upload_image_to_hf(image, filename)
|
141 |
|
142 |
if not image_url.startswith("Error"):
|
143 |
markdown_output += f"\n"
|
@@ -148,14 +154,20 @@ def format_to_markdown(text, images, status_callback):
|
|
148 |
|
149 |
return markdown_output
|
150 |
|
151 |
-
def process_pdf(pdf_input, pdf_url,
|
152 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
153 |
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
status_callback("Starting PDF processing...")
|
155 |
|
156 |
if not HF_TOKEN:
|
157 |
status_callback("Error: HF_TOKEN not set.")
|
158 |
-
return "Error: HF_TOKEN not set in Spaces Secrets.",
|
159 |
|
160 |
# Log poppler status
|
161 |
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
@@ -172,24 +184,24 @@ def process_pdf(pdf_input, pdf_url, status_callback):
|
|
172 |
except requests.RequestException as e:
|
173 |
logger.error(f"Error accessing URL: {str(e)}")
|
174 |
status_callback(f"Error accessing URL: {str(e)}")
|
175 |
-
return f"Error accessing URL: {str(e)}",
|
176 |
elif not pdf_input:
|
177 |
status_callback("Error: No PDF provided.")
|
178 |
-
return "Error: Please provide a PDF file or URL.",
|
179 |
|
180 |
text = extract_text_from_pdf(pdf_input, status_callback)
|
181 |
images = extract_images_from_pdf(pdf_input, status_callback)
|
182 |
|
183 |
if isinstance(text, str) and text.startswith("Error"):
|
184 |
status_callback("Text extraction failed.")
|
185 |
-
return text,
|
186 |
if isinstance(images, str) and images.startswith("Error"):
|
187 |
status_callback("Image extraction failed.")
|
188 |
-
return images,
|
189 |
|
190 |
markdown_output = format_to_markdown(text, images, status_callback)
|
191 |
status_callback("Processing complete.")
|
192 |
-
return markdown_output,
|
193 |
|
194 |
# Gradio Interface
|
195 |
with gr.Blocks() as iface:
|
@@ -205,12 +217,9 @@ with gr.Blocks() as iface:
|
|
205 |
|
206 |
submit_btn = gr.Button("Process PDF")
|
207 |
|
208 |
-
def update_status(message):
|
209 |
-
return message
|
210 |
-
|
211 |
submit_btn.click(
|
212 |
fn=process_pdf,
|
213 |
-
inputs=[pdf_input, pdf_url,
|
214 |
outputs=[output, status]
|
215 |
)
|
216 |
|
|
|
35 |
def ensure_hf_dataset():
|
36 |
"""Create or get Hugging Face dataset repository."""
|
37 |
try:
|
38 |
+
# Verify token
|
39 |
+
if not HF_TOKEN:
|
40 |
+
raise ValueError("HF_TOKEN is not set")
|
41 |
repo_id = create_repo(repo_id=REPO_NAME, token=HF_TOKEN, repo_type="dataset", exist_ok=True)
|
42 |
logger.info(f"Successfully accessed/created dataset repo: {repo_id}")
|
43 |
return repo_id
|
|
|
45 |
logger.error(f"Failed to create/access dataset repo: {str(e)}")
|
46 |
return f"Error: Failed to create/access dataset repo: {str(e)}"
|
47 |
|
48 |
+
def upload_image_to_hf(image, filename, status_callback):
|
49 |
"""Upload an image to Hugging Face dataset and return its URL."""
|
50 |
+
status_callback("Checking dataset access...")
|
51 |
repo_id = ensure_hf_dataset()
|
52 |
if isinstance(repo_id, str) and repo_id.startswith("Error"):
|
53 |
return repo_id
|
|
|
58 |
image.save(temp_path, format="PNG")
|
59 |
|
60 |
# Upload to Hugging Face dataset
|
61 |
+
status_callback(f"Uploading image {filename}...")
|
62 |
file_url = hf_api.upload_file(
|
63 |
path_or_fileobj=temp_path,
|
64 |
path_in_repo=f"images/{filename}.png",
|
|
|
86 |
with pdfplumber.open(pdf_file) as pdf:
|
87 |
text = ""
|
88 |
for page in pdf.pages:
|
89 |
+
page_text = page.extract_text(layout=True) or ""
|
90 |
text += page_text + "\n\n"
|
91 |
tables = page.extract_tables()
|
92 |
for table in tables:
|
|
|
98 |
|
99 |
def extract_images_from_pdf(pdf_input, status_callback):
|
100 |
"""Extract images from PDF and convert to PIL images."""
|
101 |
+
status_callback("Checking poppler-utils...")
|
102 |
if not check_poppler():
|
103 |
return "Error: poppler-utils not found. Ensure it is installed via Dockerfile."
|
104 |
|
105 |
try:
|
106 |
if isinstance(pdf_input, str): # URL case
|
107 |
logger.info(f"Downloading PDF from URL: {pdf_input}")
|
108 |
+
status_callback("Downloading PDF for image extraction...")
|
109 |
response = requests.get(pdf_input, stream=True)
|
110 |
response.raise_for_status()
|
111 |
images = convert_from_bytes(response.content)
|
112 |
else: # File upload case
|
113 |
logger.info(f"Processing uploaded PDF: {pdf_input.name}")
|
114 |
+
status_callback("Extracting images from uploaded PDF...")
|
115 |
images = convert_from_path(pdf_input.name)
|
116 |
return images
|
117 |
except Exception as e:
|
|
|
124 |
markdown_output = "# Extracted PDF Content\n\n"
|
125 |
|
126 |
# Clean and format text
|
127 |
+
text = re.sub(r'\n\s*\n+', '\n\n', text.strip()) # Normalize newlines
|
128 |
lines = text.split("\n")
|
129 |
for line in lines:
|
130 |
# Detect headings (heuristic: all caps or specific keywords)
|
|
|
140 |
if isinstance(images, list) and images:
|
141 |
markdown_output += "## Extracted Images\n\n"
|
142 |
for i, image in enumerate(images):
|
|
|
143 |
ocr_text = pytesseract.image_to_string(image).strip()
|
144 |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
145 |
filename = f"image_{i}_{timestamp}"
|
146 |
+
image_url = upload_image_to_hf(image, filename, status_callback)
|
147 |
|
148 |
if not image_url.startswith("Error"):
|
149 |
markdown_output += f"\n"
|
|
|
154 |
|
155 |
return markdown_output
|
156 |
|
157 |
+
def process_pdf(pdf_input, pdf_url, status):
|
158 |
"""Main function to process PDF input (file or URL) and generate Markdown."""
|
159 |
logger.info("Starting PDF processing at %s", datetime.now().strftime("%Y-%m-%d %H:%M:%S PDT"))
|
160 |
+
|
161 |
+
def status_callback(message):
|
162 |
+
nonlocal status
|
163 |
+
status = message
|
164 |
+
return status
|
165 |
+
|
166 |
status_callback("Starting PDF processing...")
|
167 |
|
168 |
if not HF_TOKEN:
|
169 |
status_callback("Error: HF_TOKEN not set.")
|
170 |
+
return "Error: HF_TOKEN not set in Spaces Secrets.", status
|
171 |
|
172 |
# Log poppler status
|
173 |
logger.info(f"Poppler check: {'Found' if check_poppler() else 'Not found'}")
|
|
|
184 |
except requests.RequestException as e:
|
185 |
logger.error(f"Error accessing URL: {str(e)}")
|
186 |
status_callback(f"Error accessing URL: {str(e)}")
|
187 |
+
return f"Error accessing URL: {str(e)}", status
|
188 |
elif not pdf_input:
|
189 |
status_callback("Error: No PDF provided.")
|
190 |
+
return "Error: Please provide a PDF file or URL.", status
|
191 |
|
192 |
text = extract_text_from_pdf(pdf_input, status_callback)
|
193 |
images = extract_images_from_pdf(pdf_input, status_callback)
|
194 |
|
195 |
if isinstance(text, str) and text.startswith("Error"):
|
196 |
status_callback("Text extraction failed.")
|
197 |
+
return text, status
|
198 |
if isinstance(images, str) and images.startswith("Error"):
|
199 |
status_callback("Image extraction failed.")
|
200 |
+
return images, status
|
201 |
|
202 |
markdown_output = format_to_markdown(text, images, status_callback)
|
203 |
status_callback("Processing complete.")
|
204 |
+
return markdown_output, status
|
205 |
|
206 |
# Gradio Interface
|
207 |
with gr.Blocks() as iface:
|
|
|
217 |
|
218 |
submit_btn = gr.Button("Process PDF")
|
219 |
|
|
|
|
|
|
|
220 |
submit_btn.click(
|
221 |
fn=process_pdf,
|
222 |
+
inputs=[pdf_input, pdf_url, status],
|
223 |
outputs=[output, status]
|
224 |
)
|
225 |
|