import gradio as gr
import cv2
import numpy as np
import pytesseract
import re
import google.generativeai as genai
from rapidfuzz.distance import Levenshtein
import os

# Install the Tesseract OCR binary at startup (required for pytesseract on Hugging Face Spaces)
os.system('apt-get update && apt-get install -y tesseract-ocr')

# Configure Generative AI (Google Gemini); the key is read from the API_KEY environment variable
GOOGLE_API_KEY = os.getenv("API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel("gemini-1.5-flash")

# Image processing functions
def threshold_image(img, threshold_value=None):
    if threshold_value is None:  # Adaptive thresholding
        thresholded_image = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                                  cv2.THRESH_BINARY, 11, 2)
    else:  # Manual thresholding
        _, thresholded_image = cv2.threshold(img, threshold_value, 255, cv2.THRESH_BINARY)
    return thresholded_image

def bm3d_denoising(img, sigma_psd=55):
    # Note: uses OpenCV's non-local means denoising as a stand-in for true BM3D;
    # sigma_psd is passed as the filter strength (h) parameter.
    return cv2.fastNlMeansDenoising(img, None, sigma_psd)

def remove_noise(img, kernel_size=3):
    kernel = np.ones((kernel_size, kernel_size), np.float32) / (kernel_size**2)
    denoised = cv2.filter2D(img, -1, kernel)
    return cv2.medianBlur(denoised, 3)

def sharpen_image(img):
    # 3x3 sharpening kernel (weights sum to 1, so overall brightness is preserved)
    kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    return cv2.filter2D(img, -1, kernel)

def remove_extra_spaces_and_lines(text):
    # Collapse runs of spaces/tabs, then collapse repeated blank lines into a single one
    text = re.sub(r'[ \t]+', ' ', text).strip()
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text

def calculate_accuracy(text1, text2):
    # Character-level accuracy based on normalized Levenshtein distance
    # (a difflib.SequenceMatcher ratio was used previously)
    distance = Levenshtein.distance(text1, text2)
    max_length = max(len(text1), len(text2))
    if max_length == 0:
        return 1.0  # Both strings empty: treat as a perfect match
    return 1 - (distance / max_length)
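
# A minimal usage sketch for calculate_accuracy (illustrative values, not part of the app):
#   Levenshtein.distance("text", "texto") == 1 and the longer string has length 5,
#   so calculate_accuracy("text", "texto") == 1 - 1/5 == 0.8 (80% character-level accuracy).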

# Gradio app
def process_image(image, threshold_value=None, correct_transcription=None):
    # Gradio supplies RGB numpy arrays, so convert from RGB (not BGR) to grayscale
    img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Process the image
    thresholded = threshold_image(img, threshold_value)
    bm3d_denoised_image = bm3d_denoising(thresholded)
    denoised = remove_noise(thresholded)
    sharpened_image = sharpen_image(bm3d_denoised_image)

    # OCR
    original_text = pytesseract.image_to_string(img)
    thresholded_text = pytesseract.image_to_string(thresholded)
    bm3d_denoised_text = pytesseract.image_to_string(bm3d_denoised_image)
    denoised_text = pytesseract.image_to_string(denoised)
    sharpened_text = pytesseract.image_to_string(sharpened_image)

    # Clean up text
    original_text = remove_extra_spaces_and_lines(original_text)
    thresholded_text = remove_extra_spaces_and_lines(thresholded_text)
    bm3d_denoised_text = remove_extra_spaces_and_lines(bm3d_denoised_text)
    denoised_text = remove_extra_spaces_and_lines(denoised_text)
    sharpened_text = remove_extra_spaces_and_lines(sharpened_text)

    # Generative AI model response
    user_prompt = f"""
Below are the OCR outputs produced from several image-processing variants of a faded image containing English text.
Use all of these texts to predict the original text. Provide only the predicted text.

Pre-Processing Image Text:
{original_text}

Sharpened Image Text:
{sharpened_text}

Thresholded Image Text:
{thresholded_text}

BM3D Denoised Image Text:
{bm3d_denoised_text}

Denoised Image Text:
{denoised_text}
"""
    response = model.generate_content(user_prompt)
    model_text = response.text

    # Fall back to the model response as the reference when no transcription is provided
    if not correct_transcription:
        correct_transcription = model_text

    # Accuracy metrics
    if correct_transcription:
        original_accuracy = calculate_accuracy(original_text, correct_transcription)
        thresholded_accuracy = calculate_accuracy(thresholded_text, correct_transcription)
        bm3d_denoised_accuracy = calculate_accuracy(bm3d_denoised_text, correct_transcription)
        denoised_accuracy = calculate_accuracy(denoised_text, correct_transcription)
        sharpened_accuracy = calculate_accuracy(sharpened_text, correct_transcription)
        model_accuracy = calculate_accuracy(model_text, correct_transcription)
        accuracy_metrics = f"""
Original Image Accuracy: {original_accuracy:.2%}
Thresholded Image Accuracy: {thresholded_accuracy:.2%}
BM3D Denoised Image Accuracy: {bm3d_denoised_accuracy:.2%}
Denoised Image Accuracy: {denoised_accuracy:.2%}
Sharpened Image Accuracy: {sharpened_accuracy:.2%}
Model Response Accuracy: {model_accuracy:.2%}
"""
    else:
        accuracy_metrics = "No correct transcription provided."

    # Return results
    return (
        image, thresholded, bm3d_denoised_image, denoised, sharpened_image,
        original_text, thresholded_text, bm3d_denoised_text, denoised_text, sharpened_text,
        model_text, accuracy_metrics
    )

# Interface
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Faded Text Restoration")
    with gr.Row():
        gr.Markdown("""
        ### Legend
        - **Model Response**: Text generated by the Generative AI model.
        - **Accuracy Metrics**: Comparison of the OCR results with the correct transcription if one is provided, otherwise with the model response.
        """)
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Upload Image", type="numpy")
            threshold_slider = gr.Slider(label="Threshold Value", minimum=0, maximum=255, step=1, value=242)
            adaptive_checkbox = gr.Checkbox(label="Use Adaptive Thresholding", value=False)
            transcription_input = gr.Textbox(label="Correct Transcription (Optional)")
            process_button = gr.Button("Process Image")
        with gr.Column():
            tabs = gr.Tabs()
            with tabs:
                with gr.TabItem("Original"):
                    original_image_display = gr.Image(label="Original Image")
                    original_text_display = gr.Textbox(label="Original Image Text", lines=5)
                with gr.TabItem("Thresholded"):
                    thresholded_image_display = gr.Image(label="Thresholded Image")
                    thresholded_text_display = gr.Textbox(label="Thresholded Image Text", lines=5)
                with gr.TabItem("BM3D Denoised"):
                    bm3d_denoised_image_display = gr.Image(label="BM3D Denoised Image")
                    bm3d_denoised_text_display = gr.Textbox(label="BM3D Denoised Image Text", lines=5)
                with gr.TabItem("Denoised"):
                    denoised_image_display = gr.Image(label="Denoised Image")
                    denoised_text_display = gr.Textbox(label="Denoised Image Text", lines=5)
                with gr.TabItem("Sharpened"):
                    sharpened_image_display = gr.Image(label="Sharpened Image")
                    sharpened_text_display = gr.Textbox(label="Sharpened Image Text", lines=5)
            accuracy_output = gr.Textbox(label="Accuracy Metrics")
            model_text_display = gr.Textbox(label="Model Response Text")

    # Link button to processing function
    def update_process(image, threshold_value, use_adaptive, correct_transcription):
        threshold_value = None if use_adaptive else threshold_value
        return process_image(image, threshold_value, correct_transcription)

    process_button.click(
        update_process,
        inputs=[image_input, threshold_slider, adaptive_checkbox, transcription_input],
        outputs=[
            original_image_display, thresholded_image_display,
            bm3d_denoised_image_display, denoised_image_display,
            sharpened_image_display, original_text_display,
            thresholded_text_display, bm3d_denoised_text_display,
            denoised_text_display, sharpened_text_display,
            model_text_display, accuracy_output
        ],
    )

# Launch app
demo.launch()
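
# To run this outside the Space (a sketch; the filename app.py and package names are
# assumptions, not taken from the source):
#   1. Install the Tesseract binary and the Python packages implied by the imports above,
#      e.g. gradio, opencv-python-headless, numpy, pytesseract, rapidfuzz, google-generativeai.
#   2. Export API_KEY with a Google Generative AI key, then run `python app.py`.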