Adignite committed on
Commit
5eab8c7
·
verified ·
1 Parent(s): c253704

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import cv2
3
+ import numpy as np
4
+ import easyocr
5
+ import re
6
+ from langdetect import detect_langs
7
+ from PIL import Image
8
+ import io
9
+
10
@st.cache_resource
def load_easyocr_reader():
    """Return the shared EasyOCR reader for Hindi and English (CPU only).

    The reader is wrapped in ``st.cache_resource`` so it is constructed once
    and reused, instead of being rebuilt every time this function is called.

    Returns:
        An ``easyocr.Reader`` configured for the ``'hi'`` and ``'en'``
        languages with ``gpu=False``.
    """
    return easyocr.Reader(['hi', 'en'], gpu=False)
12
+
13
def preprocess_image(image):
    """Turn an uploaded image into a binarised array for OCR.

    Args:
        image: Object exposing ``convert('RGB')`` (a PIL image in this app).

    Returns:
        The array produced by converting to grayscale, denoising, adaptive
        thresholding and a single dilation pass.
    """
    rgb_pixels = np.array(image.convert('RGB'))
    grayscale = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2GRAY)
    smoothed = cv2.fastNlMeansDenoising(grayscale, None, 10, 7, 21)
    binary = cv2.adaptiveThreshold(
        smoothed,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        11,
        2,
    )
    # The structuring element is a single 1x1 cell of ones.
    structuring_element = np.ones((1, 1), np.uint8)
    return cv2.dilate(binary, structuring_element, iterations=1)
21
+
22
def perform_easyocr(image, reader):
    """Run OCR on ``image`` and return the recognised text as one string.

    Args:
        image: Image passed through ``preprocess_image`` before recognition.
        reader: Object whose ``readtext`` method performs the recognition.

    Returns:
        The text fragments returned by ``reader.readtext`` joined with single
        spaces.
    """
    readtext_options = dict(
        paragraph=True,
        detail=0,
        contrast_ths=0.2,
        adjust_contrast=0.5,
        add_margin=0.1,
        width_ths=0.7,
        height_ths=0.7,
    )
    fragments = reader.readtext(preprocess_image(image), **readtext_options)
    return ' '.join(fragments)
29
+
30
def detect_languages(text):
    """Report which of Hindi / English appear in ``text``.

    The text is first reduced to ASCII letters, characters in the
    U+0900-U+097F range and whitespace. ``detect_langs`` is then asked for
    candidate languages; ``'hi'`` and ``'en'`` candidates with probability
    above 0.1 are reported. If ``detect_langs`` raises, the character-range
    check in ``fallback_language_check`` is used instead.

    Args:
        text: Raw text (for example OCR output).

    Returns:
        A list containing ``'Hindi'`` and/or ``'English'``; empty when the
        cleaned text is empty or neither language passes the threshold.
    """
    cleaned_text = re.sub(r'[^a-zA-Z\u0900-\u097F\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    if not cleaned_text:
        return []

    try:
        candidates = detect_langs(cleaned_text)
    except Exception:
        # Best-effort fallback on detector failure. ``Exception`` (not a bare
        # ``except``) so KeyboardInterrupt / SystemExit still propagate.
        return fallback_language_check(cleaned_text)

    labels = {'hi': 'Hindi', 'en': 'English'}
    return [
        labels[candidate.lang]
        for candidate in candidates
        if candidate.lang in labels and candidate.prob > 0.1
    ]
47
+
48
def fallback_language_check(text):
    """Detect Hindi / English purely from the characters present in ``text``.

    A character in U+0900-U+097F (inclusive) marks the text as Hindi; an
    ASCII letter ``A-Z`` or ``a-z`` marks it as English. The bounds are
    inclusive and limited to letters, so ``'z'`` and U+097F count while the
    punctuation between ``'Z'`` and ``'a'`` does not.

    Args:
        text: Text to inspect.

    Returns:
        A list with ``'Hindi'`` and/or ``'English'`` (in that order), or an
        empty list when neither kind of character is present.
    """
    has_hindi = any('\u0900' <= char <= '\u097F' for char in text)
    has_english = any(
        'a' <= char <= 'z' or 'A' <= char <= 'Z' for char in text
    )

    detected = []
    if has_hindi:
        detected.append('Hindi')
    if has_english:
        detected.append('English')
    return detected
62
+
63
def highlight_text(text, keywords):
    """Return ``text`` as HTML with every keyword occurrence highlighted.

    All keywords are matched case-insensitively in a single pass over the
    original text, so a keyword can never match inside the ``<span>`` markup
    added for another keyword. Everything taken from ``text`` is HTML-escaped
    because the caller renders the result as raw HTML.

    Args:
        text: Plain text to search.
        keywords: Iterable of literal strings to highlight; empty strings
            are ignored.

    Returns:
        An HTML string in which each match is wrapped in
        ``<span style="background-color: yellow;">...</span>``.
    """
    import html

    # De-duplicate while keeping order, then try longer keywords first so
    # that "foobar" wins over "foo" at the same position.
    terms = sorted(dict.fromkeys(k for k in keywords if k), key=len, reverse=True)
    if not terms:
        return html.escape(text)

    pattern = re.compile('|'.join(re.escape(term) for term in terms), re.IGNORECASE)

    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(
            '<span style="background-color: yellow;">'
            + html.escape(match.group(0))
            + '</span>'
        )
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return ''.join(pieces)
68
+
69
def main():
    """Build the Streamlit page: upload an image, run OCR, search the text.

    The extracted text is kept in ``st.session_state.extracted_text`` so the
    search section can read it after OCR has been performed.

    NOTE(review): indentation was lost in the diff view this was recovered
    from. The nesting below (OCR results inside the button branch, search
    section inside the upload branch) is an assumption - confirm against the
    original file.
    """
    st.title("OCR for Hindi and English")

    easyocr_reader = load_easyocr_reader()

    # Initialize session state variables
    if 'extracted_text' not in st.session_state:
        st.session_state.extracted_text = ""

    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is not None:
        image = Image.open(uploaded_file)
        st.image(image, caption='Uploaded Image', use_column_width=True)

        if st.button('Perform OCR'):
            # Run OCR and store the result for the search section below.
            with st.spinner('Processing...'):
                st.session_state.extracted_text = perform_easyocr(image, easyocr_reader)

            st.subheader("Extracted Text:")
            st.write(st.session_state.extracted_text)

            # Report which of Hindi / English were found in the OCR output.
            languages_detected = detect_languages(st.session_state.extracted_text)
            if languages_detected:
                st.write("Detected languages:", ', '.join(languages_detected))
            else:
                st.write("No languages detected.")

        if st.session_state.extracted_text:  # Check if OCR has been performed
            st.subheader("Search in Extracted Text")
            search_query = st.text_input("Enter keywords to search:", "")
            if search_query:
                # Each whitespace-separated word is highlighted on its own.
                keywords = search_query.split()
                highlighted_text = highlight_text(st.session_state.extracted_text, keywords)
                # highlight_text returns HTML, hence unsafe_allow_html.
                st.markdown(highlighted_text, unsafe_allow_html=True)
103
+
104
# Run the app only when this file is executed as the main script.
if __name__ == "__main__":
    main()