Update app.py

app.py CHANGED
@@ -7,82 +7,164 @@ import json

Before (removed lines are prefixed with "-"):

import tempfile
import easyocr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

-#
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-# EasyOCR reader for bounding boxes
reader = easyocr.Reader(['en'])

-def
-
-    results = reader.readtext(np.array(image))
-    draw = ImageDraw.Draw(image)
    words_json = []
-
        })

-    paragraphs_json = words_json.copy()

    output_json = {
        "words": words_json,
-        "
    }

-    # Save JSON to a temporary file for Gradio download
-    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w')
-    tmp_file.write(json_str)
-    tmp_file.close()

iface = gr.Interface(
-    fn=
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
-        gr.Textbox(label="Image URL (optional)")
    ],
    outputs=[
-        gr.Image(type="pil", label="Annotated Image"),
-        gr.Textbox(label="
        gr.File(label="Download JSON")
    ],
-    title="
-    description="
)

if __name__ == "__main__":

After (added lines are prefixed with "+"):

import tempfile
import easyocr
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from bs4 import BeautifulSoup
+import base64
+import re

+# Initialize OCR models
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
reader = easyocr.Reader(['en'])

+def extract_images_from_html(html_file):
+    """Extract images from HTML file (base64 or URLs)"""
+    images = []
+    soup = BeautifulSoup(html_file.read(), "html.parser")
+    for img_tag in soup.find_all("img"):
+        src = img_tag.get("src")
+        if not src:
+            continue
+        if src.startswith("data:image"):
+            b64_data = re.sub(r"^data:image/.+;base64,", "", src)
+            image = Image.open(BytesIO(base64.b64decode(b64_data))).convert("RGB")
+            images.append(image)
+        else:
+            try:
+                response = requests.get(src)
+                image = Image.open(BytesIO(response.content)).convert("RGB")
+                images.append(image)
+            except:
+                continue
+    return images
+
+def parse_html_text(html_file):
+    """Parse HTML text and generate approximate bounding boxes"""
+    html_content = html_file.read().decode("utf-8")
+    soup = BeautifulSoup(html_content, "html.parser")
+    body_text = soup.get_text(separator="\n")
+    lines = [line.strip() for line in body_text.split("\n") if line.strip()]

    words_json = []
+    lines_json = []
+
+    y_offset = 0
+    line_height = 20
+    char_width = 10
+
+    for line in lines:
+        line_words = line.split()
+        line_bbox = [0, y_offset, char_width * len(line), y_offset + line_height]
+
+        word_entries = []
+        x_offset = 0
+        for word in line_words:
+            word_bbox = [x_offset, y_offset, x_offset + char_width * len(word), y_offset + line_height]
+            word_entries.append({
+                "text": word,
+                "bbox": word_bbox
+            })
+            words_json.append({
+                "text": word,
+                "bbox": word_bbox
+            })
+            x_offset += char_width * (len(word) + 1)
+
+        lines_json.append({
+            "text": line,
+            "bbox": line_bbox,
+            "words": word_entries
        })

+        y_offset += line_height

    output_json = {
        "words": words_json,
+        "lines": lines_json
    }

+    return html_content, output_json

+def load_image(image_file, image_url):
+    if image_file:
+        return [image_file]
+    elif image_url:
+        response = requests.get(image_url)
+        return [Image.open(BytesIO(response.content)).convert("RGB")]
+    return []
+
+def detect_text_combined(image_file, image_url, html_file):
+    # HTML path
+    if html_file:
+        html_content, output_json = parse_html_text(html_file)
+        json_str = json.dumps(output_json, indent=2)
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
+        tmp_file.write(json_str)
+        tmp_file.close()
+        return html_content, json_str, tmp_file.name
+
+    # Image path
+    images = load_image(image_file, image_url)
+    if not images:
+        return None, "No input provided.", None
+
+    all_output_json = []
+    annotated_images = []
+
+    for image in images:
+        results = reader.readtext(np.array(image))
+        draw = ImageDraw.Draw(image)
+        words_json = []
+
+        for bbox, _, conf in results:
+            x_coords = [float(point[0]) for point in bbox]
+            y_coords = [float(point[1]) for point in bbox]
+            x_min, y_min = min(x_coords), min(y_coords)
+            x_max, y_max = max(x_coords), max(y_coords)
+
+            # Crop word for TrOCR recognition
+            word_crop = image.crop((x_min, y_min, x_max, y_max))
+            pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
+            generated_ids = model.generate(pixel_values)
+            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+            draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
+
+            words_json.append({
+                "text": text,
+                "bbox": [x_min, y_min, x_max, y_max],
+                "confidence": float(conf)
+            })
+
+        paragraphs_json = words_json.copy()
+        output_json = {
+            "words": words_json,
+            "paragraphs": paragraphs_json
+        }
+        json_str = json.dumps(output_json, indent=2)
+
+        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
+        tmp_file.write(json_str)
+        tmp_file.close()
+
+        annotated_images.append((image, json_str, tmp_file.name))
+
+    # Return first image for simplicity (can extend to gallery)
+    return annotated_images[0]

iface = gr.Interface(
+    fn=detect_text_combined,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(label="Image URL (optional)"),
+        gr.File(label="Upload HTML File", file_types=[".html", ".htm"])
    ],
    outputs=[
+        gr.Image(type="pil", label="Annotated Image / N/A for HTML"),
+        gr.Textbox(label="JSON Output"),
        gr.File(label="Download JSON")
    ],
+    title="Combined OCR & HTML Text Bounding Box Extractor",
+    description="Upload an image, provide an image URL, or upload an HTML file. Outputs word- and line-level bounding boxes in JSON with annotated images for images."
)

if __name__ == "__main__":
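
For a quick local check of the new combined entry point outside the Gradio UI, something like the lines below could be used. This is a minimal sketch, not part of the commit: the sample file names are placeholders, and it assumes the imports above the hunk (PIL's Image/ImageDraw, BytesIO, requests, numpy as np, json, gradio as gr) are present in app.py, as the hunk context suggests.

# Hypothetical smoke test; assumes app.py is importable and the sample files exist.
from PIL import Image
from app import detect_text_combined  # loads the TrOCR and EasyOCR models on import

# Image path: EasyOCR proposes word boxes, TrOCR re-reads each crop.
# Returns (annotated PIL image, JSON string, path to a temporary JSON file).
img = Image.open("sample.png").convert("RGB")  # placeholder file name
annotated, json_str, json_path = detect_text_combined(img, None, None)

# HTML path: parse_html_text() derives approximate word/line boxes from the markup.
# Returns (raw HTML string, JSON string, path to a temporary JSON file).
with open("sample.html", "rb") as f:  # placeholder file name
    html_text, json_str, json_path = detect_text_combined(None, None, f)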