|
import gradio as gr |
|
import pytesseract |
|
import cv2 |
|
import pandas as pd |
|
import re |
|
from PIL import Image |
|
import numpy as np |
|
|
|
# Number of values extract_fields returns: name, email, phone, DOB, postcode,
# CurBase value, hourly/rate value, functions.
_FIELD_COUNT = 8
_NOT_FOUND = "Not found"

# Tesseract restarts line_num inside every block/paragraph, so a physical text
# line is only identified by the combination of these columns.
_LINE_KEY = ("block_num", "par_num", "line_num")

_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')
_PHONE_RE = re.compile(r'\+\d{2}\s?\d{2,3}\s?\d{3}\s?\d{2}\s?\d{2}')
_DOB_RE = re.compile(r'\d{2}\.\d{2}\.\d{4}')
_CH_POSTCODE_RE = re.compile(r'\bCH\b.*?(\d{4})(?![\d/])')
_BARE_POSTCODE_RE = re.compile(r'(?<!\d|\w)[0-9]{4}(?!\d|\w)')


def _ocr_words(img):
    """Run word-level OCR on *img* and return the rows with non-blank text."""
    words = pytesseract.image_to_data(img, output_type=pytesseract.Output.DATAFRAME)
    words = words.dropna(subset=["text"]).copy()
    # pandas infers a numeric dtype when no token is textual (blank page,
    # digits only); the .str accessor would then raise, so force strings.
    words["text"] = words["text"].astype(str)
    return words[words["text"].str.strip() != ""]


def _same_line(words, row):
    """Return the rows of *words* on the same physical text line as *row*."""
    selected = words
    for col in _LINE_KEY:
        if col in selected.columns:
            selected = selected[selected[col] == row[col]]
    return selected


def _clean_name(tokens):
    """Strip leading/trailing non-letters from each token and join the rest."""
    stripped = (re.sub(r'^[^a-zA-Z]+|[^a-zA-Z]+$', '', t) for t in tokens)
    return ' '.join(t for t in stripped if t)


def _find_name(words):
    """Return up to two cleaned words that follow the first "tit" token on its line.

    Returns an empty string when no such token or follower exists.
    """
    if "line_num" not in words.columns or "left" not in words.columns:
        return ""
    hits = words[words["text"].str.lower().str.contains("tit", na=False)]
    if hits.empty:
        return ""
    line = _same_line(words, hits.iloc[0]).sort_values(by="left").reset_index(drop=True)
    positions = line.index[line["text"].str.lower().str.contains("tit", na=False)]
    if positions.empty:
        return ""
    start = positions[0] + 1
    return _clean_name(line["text"].iloc[start:start + 2])


def _value_next_to(words, keyword, direction="right", max_dist=200):
    """Return the first word right of *keyword* on the same line, or None.

    Only ``direction="right"`` is supported. A candidate's left edge must lie
    strictly between the keyword's left edge and that edge plus *max_dist*.
    """
    if direction != "right":
        return None
    if "line_num" not in words.columns or "left" not in words.columns:
        return None
    hits = words[words["text"].str.lower() == keyword.lower()]
    if hits.empty:
        return None
    anchor = hits.iloc[0]
    x = anchor["left"]
    line = _same_line(words, anchor)
    nearby = line[(line["left"] > x) & (line["left"] < x + max_dist)].sort_values("left")
    return None if nearby.empty else nearby["text"].iloc[0]


def _find_postcode(raw_text):
    """Return a 4-digit postcode found in *raw_text*, or "Not found".

    The first line that has four digits after a standalone "CH" wins;
    otherwise the first standalone 4-digit number in the whole text is used.
    """
    for line in raw_text.splitlines():
        found = _CH_POSTCODE_RE.search(line)
        if found:
            return found.group(1)
    found = _BARE_POSTCODE_RE.search(raw_text)
    return found.group(0) if found else _NOT_FOUND


def _find_functions(words):
    """Return the cleaned text lines located just below the first "function" word.

    Words whose top edge is between 10 and 120 units below that word are
    grouped per physical line and stripped of non-alphanumeric characters;
    lines of one character or less are dropped.
    """
    if any(col not in words.columns for col in ("top", "left", "line_num")):
        return []
    header = words[words["text"].str.lower().str.contains("function", na=False)]
    if header.empty:
        return []
    base_y = header.iloc[0]["top"]
    below = words[(words["top"] > base_y + 10) & (words["top"] < base_y + 120)]
    if below.empty:
        return []
    keys = [col for col in _LINE_KEY if col in below.columns]
    below = below.sort_values(by=keys + ["left"])
    result = []
    for _, group in below.groupby(keys):
        cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(group["text"])).strip()
        if len(cleaned) > 1:
            result.append(cleaned)
    return result


def extract_fields(image):
    """Extract structured fields from a document image via Tesseract OCR.

    Args:
        image: a PIL image (anything exposing ``convert("RGB")``), or None.

    Returns:
        A list of exactly eight strings, in this order: name, email, phone,
        date of birth, postcode, value next to "CurBase", value next to
        "hourly"/"rate", and the functions block (newline-joined). Missing
        fields are "Not found". On failure the first entry is
        ``"Error: <message>"`` and the remaining seven are "Not found".
    """
    failure_tail = [_NOT_FOUND] * (_FIELD_COUNT - 1)
    if image is None:
        return ["Error: no image provided"] + failure_tail

    try:
        rgb = image.convert("RGB")
        # The name lookup runs on a binarised copy of the page. THRESH_BINARY
        # on the RGB->gray conversion equals the former BGR flip +
        # THRESH_BINARY_INV + bitwise_not sequence.
        gray = cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2GRAY)
        binarized = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 25, 15
        )
        name = _find_name(_ocr_words(Image.fromarray(binarized)))

        # Every other field is read from OCR of the unprocessed image, which
        # is now run once instead of twice.
        words = _ocr_words(image)
        text = " ".join(words["text"])
        email_match = _EMAIL_RE.search(text)
        phone_match = _PHONE_RE.search(text)

        raw_text = pytesseract.image_to_string(image, config='--psm 6')
        dob_match = _DOB_RE.search(raw_text)
        functions = _find_functions(words)

        return [
            name or _NOT_FOUND,
            email_match.group(0) if email_match else _NOT_FOUND,
            phone_match.group(0) if phone_match else _NOT_FOUND,
            dob_match.group(0) if dob_match else _NOT_FOUND,
            _find_postcode(raw_text),
            _value_next_to(words, "CurBase") or _NOT_FOUND,
            _value_next_to(words, "hourly") or _value_next_to(words, "rate") or _NOT_FOUND,
            "\n".join(functions) if functions else _NOT_FOUND,
        ]
    except Exception as e:
        # UI boundary: report the failure in the first field rather than
        # raising, keeping the list length equal to the number of outputs.
        return [f"Error: {e}"] + failure_tail
|
|
|
|
|
|
|
|
|
# Gradio front end: the left column takes the upload and holds the run button
# and a clickable example; the right column shows one text box per extracted
# field, in the same order extract_fields returns its values.
with gr.Blocks() as demo:
    # NOTE(review): the "π" in the heading may be a mis-encoded emoji — confirm.
    gr.Markdown("## π Image OCR Field Extractor")
    gr.Markdown("Upload a document image to extract structured data fields.")

    with gr.Row():
        with gr.Column():
            image_input = gr.Image(type="pil", label=" Upload Your Document")
            submit_btn = gr.Button(" Run Extraction")

            gr.Examples(
                examples=["example_doc.jpeg"],
                inputs=[image_input],
                label=" Example Image (Click to load into uploader)",
            )

        with gr.Column():
            name = gr.Text(label="Name")
            email = gr.Text(label="Email")
            phone = gr.Text(label="Phone")
            dob = gr.Text(label="DOB")
            postcode = gr.Text(label="Postcode")
            prem = gr.Text(label="Prem (CurBase)")
            rate = gr.Text(label="Temp (Hourly Rate)")
            functions = gr.Textbox(label="Functions", lines=4)

    # Route the uploaded image through extract_fields and spread its results
    # over the eight result boxes.
    submit_btn.click(
        fn=extract_fields,
        inputs=image_input,
        outputs=[name, email, phone, dob, postcode, prem, rate, functions],
    )


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|
|
|
|