rahul7star committed on
Commit
eeb0aa9
·
verified ·
1 Parent(s): 065f9a9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -45
app.py CHANGED
@@ -16,10 +16,10 @@ processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
19
- def extract_images_from_html(html_file):
20
- """Extract images from HTML file (base64 or URLs)"""
21
  images = []
22
- soup = BeautifulSoup(html_file.read(), "html.parser")
23
  for img_tag in soup.find_all("img"):
24
  src = img_tag.get("src")
25
  if not src:
@@ -39,7 +39,18 @@ def extract_images_from_html(html_file):
39
 
40
  def parse_html_text(html_file):
41
  """Parse HTML text and generate approximate bounding boxes"""
42
- html_content = html_file.read().decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
43
  soup = BeautifulSoup(html_content, "html.parser")
44
  body_text = soup.get_text(separator="\n")
45
  lines = [line.strip() for line in body_text.split("\n") if line.strip()]
@@ -79,10 +90,11 @@ def parse_html_text(html_file):
79
 
80
  output_json = {
81
  "words": words_json,
82
- "lines": lines_json
 
83
  }
84
 
85
- return html_content, output_json
86
 
87
  def load_image(image_file, image_url):
88
  if image_file:
@@ -95,61 +107,59 @@ def load_image(image_file, image_url):
95
  def detect_text_combined(image_file, image_url, html_file):
96
  # HTML path
97
  if html_file:
98
- html_content, output_json = parse_html_text(html_file)
99
  json_str = json.dumps(output_json, indent=2)
100
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
101
  tmp_file.write(json_str)
102
  tmp_file.close()
103
- return html_content, json_str, tmp_file.name
 
 
 
 
104
 
105
  # Image path
106
  images = load_image(image_file, image_url)
107
  if not images:
108
  return None, "No input provided.", None
109
 
110
- all_output_json = []
111
- annotated_images = []
112
-
113
- for image in images:
114
- results = reader.readtext(np.array(image))
115
- draw = ImageDraw.Draw(image)
116
- words_json = []
117
-
118
- for bbox, _, conf in results:
119
- x_coords = [float(point[0]) for point in bbox]
120
- y_coords = [float(point[1]) for point in bbox]
121
- x_min, y_min = min(x_coords), min(y_coords)
122
- x_max, y_max = max(x_coords), max(y_coords)
123
-
124
- # Crop word for TrOCR recognition
125
- word_crop = image.crop((x_min, y_min, x_max, y_max))
126
- pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
127
- generated_ids = model.generate(pixel_values)
128
- text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
129
 
130
- draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
 
 
 
 
131
 
132
- words_json.append({
133
- "text": text,
134
- "bbox": [x_min, y_min, x_max, y_max],
135
- "confidence": float(conf)
136
- })
137
 
138
- paragraphs_json = words_json.copy()
139
- output_json = {
140
- "words": words_json,
141
- "paragraphs": paragraphs_json
142
- }
143
- json_str = json.dumps(output_json, indent=2)
144
 
145
- tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
146
- tmp_file.write(json_str)
147
- tmp_file.close()
 
 
148
 
149
- annotated_images.append((image, json_str, tmp_file.name))
 
 
 
 
 
 
 
 
150
 
151
- # Return first image for simplicity (can extend to gallery)
152
- return annotated_images[0]
153
 
154
  iface = gr.Interface(
155
  fn=detect_text_combined,
 
16
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
17
  reader = easyocr.Reader(['en'])
18
 
19
+ def extract_images_from_html(html_content):
20
+ """Extract images from HTML content (base64 or URLs)"""
21
  images = []
22
+ soup = BeautifulSoup(html_content, "html.parser")
23
  for img_tag in soup.find_all("img"):
24
  src = img_tag.get("src")
25
  if not src:
 
39
 
40
  def parse_html_text(html_file):
41
  """Parse HTML text and generate approximate bounding boxes"""
42
+ # Handle different Gradio file types
43
+ if hasattr(html_file, "read"):
44
+ html_content = html_file.read()
45
+ if isinstance(html_content, bytes):
46
+ html_content = html_content.decode("utf-8")
47
+ else:
48
+ # NamedString object (Gradio v3.40+)
49
+ html_content = str(html_file)
50
+
51
+ # Extract images from HTML (optional, for OCR later)
52
+ images_in_html = extract_images_from_html(html_content)
53
+
54
  soup = BeautifulSoup(html_content, "html.parser")
55
  body_text = soup.get_text(separator="\n")
56
  lines = [line.strip() for line in body_text.split("\n") if line.strip()]
 
90
 
91
  output_json = {
92
  "words": words_json,
93
+ "lines": lines_json,
94
+ "images_found": len(images_in_html)
95
  }
96
 
97
+ return html_content, output_json, images_in_html
98
 
99
  def load_image(image_file, image_url):
100
  if image_file:
 
107
  def detect_text_combined(image_file, image_url, html_file):
108
  # HTML path
109
  if html_file:
110
+ html_content, output_json, images_in_html = parse_html_text(html_file)
111
  json_str = json.dumps(output_json, indent=2)
112
  tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
113
  tmp_file.write(json_str)
114
  tmp_file.close()
115
+ annotated_image = None
116
+ if images_in_html:
117
+ # For demo, show first extracted image if exists
118
+ annotated_image = images_in_html[0]
119
+ return annotated_image, json_str, tmp_file.name
120
 
121
  # Image path
122
  images = load_image(image_file, image_url)
123
  if not images:
124
  return None, "No input provided.", None
125
 
126
+ annotated_image = images[0]
127
+ image = annotated_image
128
+ results = reader.readtext(np.array(image))
129
+ draw = ImageDraw.Draw(image)
130
+ words_json = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
+ for bbox, _, conf in results:
133
+ x_coords = [float(point[0]) for point in bbox]
134
+ y_coords = [float(point[1]) for point in bbox]
135
+ x_min, y_min = min(x_coords), min(y_coords)
136
+ x_max, y_max = max(x_coords), max(y_coords)
137
 
138
+ # Crop word for TrOCR recognition
139
+ word_crop = image.crop((x_min, y_min, x_max, y_max))
140
+ pixel_values = processor(images=word_crop, return_tensors="pt").pixel_values
141
+ generated_ids = model.generate(pixel_values)
142
+ text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
143
 
144
+ draw.rectangle([x_min, y_min, x_max, y_max], outline="red", width=2)
 
 
 
 
 
145
 
146
+ words_json.append({
147
+ "text": text,
148
+ "bbox": [x_min, y_min, x_max, y_max],
149
+ "confidence": float(conf)
150
+ })
151
 
152
+ paragraphs_json = words_json.copy()
153
+ output_json = {
154
+ "words": words_json,
155
+ "paragraphs": paragraphs_json
156
+ }
157
+ json_str = json.dumps(output_json, indent=2)
158
+ tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w")
159
+ tmp_file.write(json_str)
160
+ tmp_file.close()
161
 
162
+ return annotated_image, json_str, tmp_file.name
 
163
 
164
  iface = gr.Interface(
165
  fn=detect_text_combined,