# Hugging Face Space: Combined OCR & HTML Text Bounding Box Extractor
# (page-status banner from the Spaces listing removed; it was not Python code)
import base64
import json
import re
import tempfile
from io import BytesIO

import easyocr
import gradio as gr
import numpy as np
import requests
from bs4 import BeautifulSoup
from PIL import Image, ImageDraw
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# ----------------- Initialize OCR -----------------
# TrOCR (processor + model) re-reads cropped word regions — tuned for
# handwritten text; EasyOCR supplies the word-level detection boxes.
# Loaded once at import time so every request reuses the same weights.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
reader = easyocr.Reader(['en'])
# ----------------- HTML Parsing -----------------
def parse_html_to_json(html_file):
    """Parse an HTML file uploaded via Gradio into OCR-style JSON.

    Builds the same ``{"words": [...], "paragraphs": [...]}`` structure as
    the image-OCR path, with synthetic bounding boxes computed from a fixed
    monospace layout (one text node per line, fixed line height and
    character width).

    Parameters
    ----------
    html_file : str | file-like | Any
        A temp-file path string (what Gradio hands over for uploads), an
        open file-like object, or raw HTML content.

    Returns
    -------
    dict
        ``{"words": [...], "paragraphs": [...]}`` on success, or
        ``{"error": "..."}`` when the input cannot be read.
    """
    try:
        if isinstance(html_file, str):
            # Gradio gives a temp file path string for uploaded files.
            with open(html_file, "r", encoding="utf-8") as f:
                html_content = f.read()
        elif hasattr(html_file, "read"):  # file-like object
            html_content = html_file.read()
            if isinstance(html_content, bytes):
                html_content = html_content.decode("utf-8")
        else:
            html_content = str(html_file)
    except Exception as e:
        return {"error": f"Cannot read HTML file: {e}"}

    soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-visible text containers so <script>/<style> bodies are not
    # emitted as "words" — we only want text a reader would actually see.
    for hidden in soup(["script", "style"]):
        hidden.decompose()

    words_json = []
    paragraphs_json = []
    y_offset = 0
    line_height = 20  # synthetic pixel height of one text line
    char_width = 10   # synthetic pixel width of one character

    body = soup.body or soup

    # Iterate over all visible text nodes; each node becomes one "line".
    for element in body.find_all(string=True):
        text = element.strip()
        if not text:
            continue
        line_words = text.split()
        line_bbox = [0, y_offset, char_width * len(text), y_offset + line_height]
        word_entries = []
        x_offset = 0
        for word in line_words:
            word_bbox = [x_offset, y_offset,
                         x_offset + char_width * len(word), y_offset + line_height]
            word_entry = {"text": word, "bbox": word_bbox, "confidence": 1.0}
            word_entries.append(word_entry)
            words_json.append(word_entry)
            x_offset += char_width * (len(word) + 1)  # +1 accounts for the space
        paragraphs_json.append({
            "text": text,
            "bbox": line_bbox,
            "words": word_entries,
        })
        y_offset += line_height

    return {
        "words": words_json,
        "paragraphs": paragraphs_json,
    }
# ----------------- Image Loading -----------------
def load_image(image_file, image_url):
    """Resolve the image inputs into a list with at most one PIL image.

    A directly uploaded image takes precedence over a URL. Returns an empty
    list when neither input is provided.

    Parameters
    ----------
    image_file : PIL.Image.Image | None
        The image from Gradio's upload widget (already a PIL image).
    image_url : str | None
        Optional URL to fetch the image from.
    """
    if image_file:
        return [image_file]
    if image_url:
        # Bound the request so a dead host cannot hang the app, and raise on
        # HTTP errors instead of handing an error-page body to PIL.
        response = requests.get(image_url, timeout=30)
        response.raise_for_status()
        return [Image.open(BytesIO(response.content)).convert("RGB")]
    return []
# ----------------- Main Logic -----------------
def _json_to_download(output_json):
    """Serialize *output_json* and persist it to a temp ``.json`` file.

    Shared by both the HTML and image paths (the original duplicated this
    logic). Returns ``(json_str, path)``; the path feeds Gradio's File
    output. ``delete=False`` is deliberate — Gradio reads the file after we
    return.
    """
    json_str = json.dumps(output_json, indent=2)
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".json", mode="w", encoding="utf-8"
    ) as tmp_file:
        tmp_file.write(json_str)
    return json_str, tmp_file.name


def detect_text_combined(image_file, image_url, html_file):
    """Dispatch to HTML parsing or image OCR and build the Gradio outputs.

    Returns a tuple ``(annotated_image | None, json_str, json_path | None)``
    matching the three interface outputs.
    """
    # ----------------- HTML Path -----------------
    if html_file:
        json_str, json_path = _json_to_download(parse_html_to_json(html_file))
        return None, json_str, json_path

    # ----------------- Image Path -----------------
    images = load_image(image_file, image_url)
    if not images:
        return None, "No input provided.", None

    image = images[0]
    # EasyOCR finds word regions; each crop is then re-read by TrOCR for
    # handwriting-friendly recognition.
    results = reader.readtext(np.array(image))
    draw = ImageDraw.Draw(image)  # annotations are drawn onto the returned image
    words_json = []
    for bbox, _, conf in results:
        # EasyOCR returns a quadrilateral; reduce it to an axis-aligned box.
        x_coords = [float(point[0]) for point in bbox]
        y_coords = [float(point[1]) for point in bbox]
        x_min, y_min = min(x_coords), min(y_coords)
        x_max, y_max = max(x_coords), max(y_coords)

        # Crop word for TrOCR recognition.
        word_crop = image.crop((x_min, y_min, x_max, y_max))
        pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
        generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

        draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
        words_json.append({
            "text": text,
            "bbox": [x_min, y_min, x_max, y_max],
            "confidence": float(conf),  # detection confidence from EasyOCR
        })

    # No paragraph grouping is implemented for images; mirror the word list
    # so the output schema matches the HTML path.
    paragraphs_json = words_json.copy()
    json_str, json_path = _json_to_download({
        "words": words_json,
        "paragraphs": paragraphs_json,
    })
    return image, json_str, json_path
# ----------------- Gradio Interface -----------------
# Three optional inputs (image upload, image URL, HTML upload) map onto the
# three outputs returned by detect_text_combined.
iface = gr.Interface(
    fn=detect_text_combined,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Image URL (optional)"),
        gr.File(label="Upload HTML File", file_types=[".html", ".htm"]),
    ],
    outputs=[
        gr.Image(type="pil", label="Annotated Image"),
        gr.Textbox(label="JSON Output"),
        gr.File(label="Download JSON"),
    ],
    title="Combined OCR & HTML Text Bounding Box Extractor",
    description=(
        "Upload an image, provide an image URL, or upload an HTML file. "
        "Outputs word- and paragraph-level bounding boxes in JSON format "
        "consistent with image OCR output."
    ),
)

if __name__ == "__main__":
    iface.launch()