# File size: 4,151 Bytes
# bcb8309
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import hashlib
import io
import os
import re
import tempfile
import unicodedata

import streamlit as st
from PIL import Image
from transformers import AutoModel, AutoTokenizer

@st.cache_resource
def load_model():
    """Load the OCR model and tokenizer from the ``srimanth-d/GOT_CPU`` checkpoint.

    The function is decorated with ``st.cache_resource``, so Streamlit hands
    back the same objects on later calls instead of loading them again.

    Returns:
        tuple: ``(model, tokenizer)``; the model has been put in eval mode.
    """
    checkpoint = 'srimanth-d/GOT_CPU'

    # NOTE: trust_remote_code=True lets the checkpoint repository supply its
    # own Python code for the tokenizer and the model.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        use_safetensors=True,
        pad_token_id=151643,
    )
    model.eval()
    return model, tokenizer

def handle_error(error_message):
    """Display *error_message* to the user in a Streamlit error box.

    The message is only shown in the UI; nothing is written to a log.

    Args:
        error_message: Text appended after the "An error occurred: " prefix.
    """
    banner = f"An error occurred: {error_message}"
    st.error(banner)

def extract_text(image_bytes, ocr_type):
    """Run OCR on an in-memory image and return the recognised text.

    The bytes are decoded with Pillow, re-encoded as PNG into a private
    temporary file, and that file's path is passed to ``model.chat``. The
    temporary file is always removed afterwards.

    Args:
        image_bytes: Raw bytes of the image to process.
        ocr_type: OCR mode, forwarded to ``model.chat`` as ``ocr_type``.

    Returns:
        The value returned by ``model.chat``, or ``None`` if any step raised
        (the error is then reported through ``handle_error``).
    """
    temp_path = None
    try:
        model, tokenizer = load_model()

        # A unique file per call: a fixed name in the working directory would
        # let concurrent sessions overwrite each other's image.
        fd, temp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)

        with Image.open(io.BytesIO(image_bytes)) as image:
            # Pillow's PNG writer rejects CMYK (possible for JPEG uploads).
            png_ready = image.convert("RGB") if image.mode == "CMYK" else image
            png_ready.save(temp_path, format="PNG")

        return model.chat(tokenizer, temp_path, ocr_type=ocr_type)
    except Exception as e:
        # Deliberate catch-all boundary: report in the UI and signal failure
        # to the caller with None instead of propagating.
        handle_error(f"Error during OCR extraction: {str(e)}")
        return None
    finally:
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file must not mask the
                # OCR result or the original error.
                pass

def _is_word_char(char):
    """Return True if *char* counts as part of a word for keyword matching.

    This is the set ``\\w`` matches (alphanumerics and underscore) plus Unicode
    combining marks, so that e.g. Devanagari vowel signs stay attached to the
    word they belong to.
    """
    return (
        char.isalnum()
        or char == "_"
        or unicodedata.category(char).startswith("M")
    )


def search_keyword(extracted_text, keyword):
    """Highlight whole-word, case-insensitive occurrences of *keyword*.

    A plain ``\\b...\\b`` regex is not used because ``\\b`` treats combining
    marks as non-word characters: Hindi keywords ending in a vowel sign would
    never match, and a bare consonant would match inside a longer word. The
    same problem affects keywords that start or end with punctuation (such as
    "C++"). Boundaries are therefore checked explicitly, and only on the sides
    where the keyword itself begins or ends with a word character.

    Args:
        extracted_text: Text to search in.
        keyword: Literal text to look for (not interpreted as a regex).

    Returns:
        tuple: ``(highlighted_text, occurrences)``. Each match in
        ``highlighted_text`` is wrapped in a red, bold ``<span>``; the rest of
        the text is copied through unchanged (it is NOT HTML-escaped). When
        either argument is empty the text is returned as is with a count of 0.
    """
    if not extracted_text or not keyword:
        return extracted_text, 0

    pattern = re.compile(re.escape(keyword), flags=re.IGNORECASE)
    text_length = len(extracted_text)

    pieces = []
    occurrences = 0
    copied_upto = 0
    search_from = 0

    while True:
        match = pattern.search(extracted_text, search_from)
        if match is None:
            break
        start, end = match.span()

        starts_inside_word = (
            start > 0
            and _is_word_char(extracted_text[start])
            and _is_word_char(extracted_text[start - 1])
        )
        ends_inside_word = (
            end < text_length
            and _is_word_char(extracted_text[end - 1])
            and _is_word_char(extracted_text[end])
        )
        if starts_inside_word or ends_inside_word:
            # Not a whole word here; retry one character further so that an
            # overlapping candidate is still considered.
            search_from = start + 1
            continue

        pieces.append(extracted_text[copied_upto:start])
        pieces.append(f"<span style='color:red'><b>{match.group(0)}</b></span>")
        occurrences += 1
        copied_upto = search_from = end

    pieces.append(extracted_text[copied_upto:])
    return "".join(pieces), occurrences

@st.cache_data
def cache_image_ocr(image_bytes, ocr_type):
    """Cached front end for ``extract_text``.

    ``st.cache_data`` memoises the result on the arguments, so repeating a
    request with the same image bytes and OCR type does not run OCR again.

    Args:
        image_bytes: Raw bytes of the image.
        ocr_type: OCR mode passed through to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for these arguments.
    """
    # NOTE(review): extract_text returns None on failure, and that None looks
    # like it gets cached too - confirm whether failed runs should be retried.
    ocr_result = extract_text(image_bytes, ocr_type)
    return ocr_result

def app():
    """Render the OCR page: upload/extraction on the left, keyword search on the right.

    Session state:
        extracted_text: OCR result for the current upload, or ``None``.
        ocr_source_key: ``(sha256 of image bytes, ocr_type)`` that produced
            ``extracted_text``. When the image or the OCR type changes, the
            stored text is discarded so a stale result is never shown.
    """
    st.set_page_config(page_title="OCR Tool", layout="wide", page_icon=":chart_with_upwards_trend:")
    st.header("Optical Character Recognition for English and Hindi Texts")
    st.write("Upload an image below for OCR:")

    if 'extracted_text' not in st.session_state:
        st.session_state.extracted_text = None
    if 'ocr_source_key' not in st.session_state:
        st.session_state.ocr_source_key = None

    col1, col2 = st.columns([1, 1])

    with col1:
        st.subheader("Upload and OCR Extraction")
        uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png", "jpeg"], accept_multiple_files=False)

        # Add OCR type selection dropdown
        ocr_type = st.selectbox("Select OCR Type:", ["ocr", "format"])

        if uploaded_file is not None:
            st.image(uploaded_file, caption='Uploaded Image', use_column_width=True)
            # getvalue() returns the whole buffer regardless of the current
            # stream position, unlike read().
            image_bytes = uploaded_file.getvalue()

            # Invalidate the stored text when a different image is uploaded
            # or a different OCR type is selected.
            source_key = (hashlib.sha256(image_bytes).hexdigest(), ocr_type)
            if st.session_state.ocr_source_key != source_key:
                st.session_state.ocr_source_key = source_key
                st.session_state.extracted_text = None

            if st.session_state.extracted_text is None:
                with st.spinner("Extracting the text..."):
                    extracted_text = cache_image_ocr(image_bytes, ocr_type)

                if extracted_text:
                    # Written as an escape so the emoji survives re-encoding
                    # of this source file.
                    st.success("Text extraction completed!", icon="\N{PARTY POPPER}")
                    st.session_state.extracted_text = extracted_text
                    st.write("Extracted Text:")
                    st.write(extracted_text)
                else:
                    st.error("Failed to extract text. Please try with a different image.")
            else:
                st.write("Extracted Text:")
                st.write(st.session_state.extracted_text)
        else:
            st.session_state.extracted_text = None
            st.session_state.ocr_source_key = None
            st.info("Please upload an image file to proceed.")

    with col2:
        st.subheader("Keyword Search")

        if st.session_state.extracted_text:
            keyword = st.text_input("Enter keyword to search")

            if keyword:
                with st.spinner(f"Searching for '{keyword}'..."):
                    highlighted_text, occurrences = search_keyword(st.session_state.extracted_text, keyword)

                if occurrences > 0:
                    st.success(f"Found {occurrences} occurrences of the keyword '{keyword}'!")
                    # The highlight markup is raw HTML, hence unsafe_allow_html;
                    # the OCR text around it is rendered unescaped as well.
                    st.markdown(highlighted_text, unsafe_allow_html=True)
                else:
                    st.warning(f"No occurrences of the keyword '{keyword}' were found.")
        else:
            st.info("Please upload an image and extract text first.")

def main():
    """Run the app and report any uncaught exception via ``handle_error``."""
    try:
        app()
    except Exception as unexpected:
        # Top-level boundary: show the failure in the UI instead of a traceback.
        handle_error("Unexpected error in the main function: " + str(unexpected))

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()