Spaces:

Macro27
/

OCR

Build error

App Files Files Community

Marco committed on Aug 26, 2024

Commit

3bc9036

1 Parent(s): 1e51b6c

initial commit

Browse files

Files changed (14) hide show

.gitignore +16 -0
LICENSE.txt +21 -0
README.md +25 -13
main.py +66 -0
packages.txt +1 -0
requirements.txt +79 -0
src/__init__.py +1 -0
src/chatbot.py +50 -0
src/ocr.py +19 -0
src/perplexity_api.py +67 -0
src/st_context.py +33 -0
src/system_initializer.py +23 -0
src/utilities.py +127 -0
src/video_processor.py +41 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+# Ignore python build artifacts
+src/__pycache__/
+# Ignore virtual environment
+venv/
+# Ignore vscode files
+.vscode/
+# Ignore streamlit secrets
+.streamlit/secrets.toml
+images/
+test.py

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Marco Lee
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md CHANGED Viewed

@@ -1,13 +1,25 @@
----
-title: OCR
-emoji: 😻
-colorFrom: red
-colorTo: blue
-sdk: streamlit
-sdk_version: 1.37.1
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# streamlit_ocr
+## Streamlit OCR Application
+This project is a Streamlit-based Optical Character Recognition (OCR) application that extracts text from a live camera feed using EasyOCR and lets users ask a chatbot about the recognized text.
+## Features
+* Live camera feed with OCR support
+* Extracted text is drawn on the video as an overlay
+* Adjustable confidence threshold for displaying extracted text
+* Freeze button that runs chatbot inference on the most recently detected text
+## Deployment
+<a href="https://st-ocr.streamlit.app/">Streamlit OCR + chatbot app</a>
+### TODOS
+- [ ] Less clunky freezing
+- [ ] Fix bugs
+### Completed tasks ✓
+- [x] Implement OCR capability
+- [x] Implement freeze function
+- [x] Implement chatbot API call
+- [x] Integrate code with streamlit
+- [x] Faster OCR
+- [x] Multilingual support

main.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import streamlit as st
+from streamlit_webrtc import webrtc_streamer, WebRtcMode
+from src.system_initializer import SystemInitializer
+from src.video_processor import VideoProcessor
+from src.utilities import Utilities
+import warnings
+warnings.filterwarnings("ignore")  # Ignore warnings for a cleaner output
+class OCRChatbotApp:
+    def __init__(self):
+        # Initialize the system components
+        self.system_initializer = SystemInitializer()
+        self.utilities = Utilities()
+        self.queues = self.system_initializer.initialize_system()  # Initialize queues and start OCR thread
+        self.conf_thresh = 50  # Default confidence threshold for OCR
+        self.n = 5  # Process every n frames
+        self.k = 30  # Number of frames to keep annotations
+    def run(self):
+        st.title('OCR and Chatbot Application')  # Set the title of the Streamlit app
+        # Initialize session state variables if they don't exist
+        if "camera_frozen" not in st.session_state:
+            st.session_state.update({"camera_frozen": False, "latest": [], "likely_text": ""})
+        # Create sliders for adjusting confidence threshold and frame processing interval
+        self.conf_thresh = st.slider('Confidence Threshold', 0, 100, 50)
+        self.n = st.slider('Process every n frames', 1, 30, 5)
+        # Button to freeze or resume the camera
+        if st.button("Freeze" if not st.session_state.camera_frozen else "Resume"):
+            st.session_state.camera_frozen = not st.session_state.camera_frozen
+            if st.session_state.camera_frozen:
+                st.session_state.likely_text = self.utilities.fetch_likely_text()  # Fetch likely text when camera is frozen
+        # Define constraints for higher resolution video capture
+        constraints = {
+            "video": {
+                "width": {"ideal": 1280},
+                "height": {"ideal": 720},
+                "frameRate": {"ideal": 30}
+            },
+            "audio": False
+        }
+        # Initialize the WebRTC streamer with the specified constraints and video processor
+        webrtc_ctx = webrtc_streamer(
+            key="example",
+            video_processor_factory=lambda: VideoProcessor(self.queues, self.conf_thresh, self.n, self.k),
+            media_stream_constraints=constraints,
+            async_processing=True,
+        )
+        # Display the likely text if the camera is frozen
+        if st.session_state.camera_frozen and st.session_state.likely_text:
+            st.write(st.session_state.likely_text)
+        else:
+            st.write("No text found")
+        # Run the chatbot
+        self.system_initializer.run_chatbot()
+if __name__ == '__main__':
+    app = OCRChatbotApp()  # Create an instance of the OCRChatbotApp
+    app.run()  # Run the app

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ libgl1-mesa-glx

requirements.txt ADDED Viewed

	@@ -0,0 +1,79 @@

+aioice==0.9.0
+aiortc==1.9.0
+altair==5.3.0
+attrs==23.2.0
+av==12.3.0
+blinker==1.8.2
+cachetools==5.4.0
+certifi==2024.7.4
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cryptography==43.0.0
+deskew==1.5.1
+dnspython==2.6.1
+easyocr==1.7.1
+filelock==3.15.4
+fsspec==2024.6.1
+gitdb==4.0.11
+GitPython==3.1.43
+google-crc32c==1.5.0
+idna==3.7
+ifaddr==0.2.0
+imageio==2.34.2
+inexactsearch==1.0.2
+Jinja2==3.1.4
+jsonschema==4.23.0
+jsonschema-specifications==2023.12.1
+lazy_loader==0.4
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.3
+ninja==1.11.1.1
+numpy==2.0.1
+opencv-python==4.10.0.84
+opencv-python-headless==4.10.0.84
+packaging==24.1
+pandas==2.2.2
+pillow==10.4.0
+protobuf==5.27.2
+pyarrow==17.0.0
+pyclipper==1.3.0.post5
+pycparser==2.22
+pydeck==0.9.1
+pyee==11.1.0
+Pygments==2.18.0
+pylibsrtp==0.10.0
+pyOpenSSL==24.2.1
+python-bidi==0.5.1
+python-dateutil==2.9.0.post0
+pytz==2024.1
+PyYAML==6.0.1
+referencing==0.35.1
+requests==2.32.3
+rich==13.7.1
+rpds-py==0.19.0
+scikit-image==0.24.0
+scipy==1.14.0
+setuptools==71.1.0
+shapely==2.0.5
+silpa_common==0.3
+six==1.16.0
+smmap==5.0.1
+soundex==1.1.3
+spellchecker==0.4
+streamlit==1.36.0
+streamlit-webrtc==0.47.7
+sympy==1.13.1
+tenacity==8.5.0
+tifffile==2024.7.21
+toml==0.10.2
+toolz==0.12.1
+torch==2.3.1
+torchvision==0.18.1
+tornado==6.4.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2

src/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # __init__.py

src/chatbot.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import streamlit as st
+import sys
+import os
+# Add the directory containing this script to the Python path so that the
+# sibling module can be imported by its bare name (run_chatbot() below does
+# `from perplexity_api import chat_completion`).
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+def chatbot_say(message):
+    with st.chat_message("assistant", avatar="🔮"):
+        st.markdown(message)
+        # Add chatbot response to chat history
+        st.session_state.chat_history.append(("assistant", message))
+def run_chatbot():
+    from perplexity_api import chat_completion
+    # Initialise chat history in session state
+    if 'chat_history' not in st.session_state:
+        st.session_state.chat_history = []
+    # Display chat history
+    for role, content in st.session_state.chat_history:
+        with st.chat_message(role):
+            st.markdown(content)
+    # If user has entered a message, add it to chat history and get chatbot response
+    if prompt := st.chat_input("Say something: "):
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(prompt)
+        # Add user message to chat history
+        st.session_state.chat_history.append(("user", prompt))
+        info = "none"
+        if st.session_state.latest:
+            info = st.session_state.latest
+        response = chat_completion(prompt, info, mode="normal")
+        # Display chatbot response
+        with st.chat_message("assistant", avatar="🔮"):
+            st.markdown(response)
+        # Add chatbot response to chat history
+        st.session_state.chat_history.append(("assistant", response))

src/ocr.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# Perform OCR in a separate thread
+def ocr_thread(frame_queue, text_queue):
+    from easyocr import Reader
+    reader = Reader(lang_list=['en'])
+    while True:
+        frame = frame_queue.get()
+        # If queue is empty, exit the loop
+        if frame is None:
+            break
+        texts = reader.readtext(frame)
+        text_queue.put(texts)
+        print(texts)
+if __name__ == "__main__":
+    # Debug aid: print the module search path when this file is run directly.
+    import sys
+    print(sys.path)

src/perplexity_api.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import requests
+import streamlit as st
+# API_KEY = st.secrets["API_KEY"]
+API_KEY = "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"
+# Endpoint that chat_completion() posts to.
+url = "https://api.perplexity.ai/chat/completions"
+# System prompt used when mode == "ocr": asks the model to pick the single most
+# consistently detected text out of a list of OCR readings.
+ppx_prompt = '''Given a Python list named latest_ocr_values, such as ['I am a Python', 'I a pthon',
+'I python', 'I am a python', 'a'], directly provide the text that is most consistently detected by the OCR.
+If multiple answers are possible, choose the most likely one only. If there is no clear answer, state 'None'.
+The output must follow the format: 'OCR scanned text: (your_answer)'. Do not explain yourself afterwards, do not include
+multiple valid outputs. Do not include any other information.'''
+# System prompt used when mode == "normal": general assistant behaviour.
+normal_prompt = '''Be a good assistant and answer my question, using information from the following prompt or relating to it, as well as
+knowledge you have about this prompt. If no information is given or if the question is not relevant to the information given,
+simply answer as normal, using any knowledge you have.'''
+# HTTP headers sent with every request; the API key is passed as a bearer token.
+headers = {
+    "Authorization": "Bearer " + API_KEY,
+    "accept": "application/json",
+    "content-type": "application/json"
+}
+# Request body template: messages[0] is the system prompt and messages[1] the
+# user message; both "content" fields are filled in by chat_completion().
+payload = {
+    "model": "mistral-7b-instruct",
+    "messages": [
+        {
+            "role": "system",
+            "content": ""
+        },
+        {
+            "role": "user",
+            "content": ""
+        }
+    ]
+}
+# Perform a chat completion in a separate thread
+def chat_completion(prompt, info="", mode="normal"):
+    while True:
+        # If queue is empty, exit the loop
+        if not prompt:
+            break
+        # Copy payload and insert prompt
+        pl = payload.copy()
+        if mode == "ocr":
+            pl["messages"][0]["content"] = ppx_prompt
+        elif mode == "normal":
+            pl["messages"][0]["content"] = normal_prompt
+        pl["messages"][1]["content"] = f"Prompt: {prompt}. Information: {info}."
+        # Perform chat completion and add results to queue
+        response = requests.post(url, json=pl, headers=headers)
+        if response.status_code == 200:
+            response_data = response.json()
+            output = response_data.get("choices")[0].get("message").get("content")
+            return output
+        else:
+            return "Error"
+if __name__ == "__main__":
+    print(chat_completion("latest_ocr_values = ['I am a cat', 'I a cat', 'I cat', 'I am cat', 'a']"))

src/st_context.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import threading
+from typing import Any, TypeVar, cast
+from streamlit.errors import NoSessionContext
+from streamlit.runtime.scriptrunner.script_run_context import SCRIPT_RUN_CONTEXT_ATTR_NAME, get_script_run_ctx
+T = TypeVar("T")
+def with_streamlit_context(fn: T) -> T:
+    """Fix bug in streamlit which raises streamlit.errors.NoSessionContext."""
+    ctx = get_script_run_ctx()
+    if ctx is None:
+        raise NoSessionContext(
+            "with_streamlit_context must be called inside a context; "
+            "construct your function on the fly, not earlier."
+        )
+    def _cb(*args: Any, **kwargs: Any) -> Any:
+        """Do it."""
+        thread = threading.current_thread()
+        do_nothing = hasattr(thread, SCRIPT_RUN_CONTEXT_ATTR_NAME) and (
+            getattr(thread, SCRIPT_RUN_CONTEXT_ATTR_NAME) == ctx
+        )
+        if not do_nothing:
+            setattr(thread, SCRIPT_RUN_CONTEXT_ATTR_NAME, ctx)
+        # Call the callback.
+        ret = fn(*args, **kwargs)
+        if not do_nothing:
+            # Why delattr? Because tasks for different users may be done by
+            # the same thread at different times. Danger danger.
+            delattr(thread, SCRIPT_RUN_CONTEXT_ATTR_NAME)
+        return ret
+    return cast(T, _cb)

src/system_initializer.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import queue
+import threading
+from streamlit.runtime.scriptrunner import add_script_run_ctx
+from src.ocr import ocr_thread
+from src.chatbot import run_chatbot
+class SystemInitializer:
+    def initialize_system(self):
+        """Initializes queues and starts OCR thread."""
+        queues = {
+            'frame_queue': queue.Queue(maxsize=1),
+            'text_queue': queue.Queue(maxsize=1),
+            'annotation_queue': queue.Queue(maxsize=1),
+            'prompt_queue': queue.Queue(maxsize=1),
+            'ppx_queue': queue.Queue(maxsize=1)
+        }
+        ocr_thread_with_ctx = threading.Thread(target=ocr_thread, args=(queues['frame_queue'], queues['text_queue']))
+        add_script_run_ctx(ocr_thread_with_ctx)
+        ocr_thread_with_ctx.start()
+        return queues
+    def run_chatbot(self):
+        run_chatbot()

src/utilities.py ADDED Viewed

	@@ -0,0 +1,127 @@

+import cv2
+import numpy as np
+from deskew import determine_skew
+from spellchecker import SpellChecker
+import streamlit as st
+from src.perplexity_api import chat_completion
+from src.st_context import with_streamlit_context
+class Utilities:
+    def __init__(self):
+        # Initialize the spell checker
+        self.spell = SpellChecker()
+    def fetch_likely_text(self):
+        """Fetches likely text based on latest OCR values."""
+        # Use the chat_completion function to fetch the latest OCR values from the session state
+        return chat_completion(f"latest_ocr_values = {st.session_state['latest']}")
+    @with_streamlit_context
+    def detect_annotations(self, frame, text_queue, conf_thresh):
+        """Detects annotations for a single video frame."""
+        # If the text queue is empty, return an empty list
+        if text_queue.empty():
+            return []
+        # Get detections from the text queue
+        detections = text_queue.get()
+        annotations = []
+        for (box, text, confidence) in detections:
+            # Only consider detections with confidence above the threshold
+            if confidence > conf_thresh / 100.0:
+                # Correct the spelling of the detected text
+                corrected_text = self.correct_spelling(text)
+                # Append the bounding box and corrected text to annotations
+                annotations.append((box, corrected_text))
+        return annotations
+    @with_streamlit_context
+    def draw_annotations(self, frame, annotations):
+        """Draws annotations on the frame."""
+        for (box, text) in annotations:
+            try:
+                # Calculate the size of the text box
+                font = cv2.FONT_HERSHEY_SIMPLEX
+                font_scale = 1
+                thickness = 2
+                text_size, baseline = cv2.getTextSize(text, font, font_scale, thickness)
+                text_width, text_height = text_size
+                # Calculate the position for the rectangle and text
+                p1 = (int(box[0][0]), int(box[0][1]))
+                p2 = (p1[0] + text_width, p1[1] - text_height - baseline)
+                # Draw a filled rectangle with transparency
+                overlay = frame.copy()
+                cv2.rectangle(overlay, p1, p2, (0, 255, 0), -1)
+                alpha = 0.4  # Transparency factor
+                cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0, frame)
+                # Put the text on top of the rectangle
+                cv2.putText(frame, text, (p1[0], p1[1] - baseline), font, font_scale, (0, 0, 0), thickness)
+            except Exception as e:
+                # Log an error message if annotation fails
+                st.error(f"Failed to annotate frame: {e}")
+        return frame
+    def _grayscale(self, image):
+        """Converts the image to grayscale."""
+        if len(image.shape) == 3:
+            gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        else:
+            gray = image
+        return gray
+    def _remove_noise(self, image):
+        """Removes noise from the image using Non-Local Means Denoising."""
+        return cv2.fastNlMeansDenoising(image, None, 10, 7, 21)
+    def _enhance_contrast(self, image):
+        """Enhances the contrast of the image using CLAHE."""
+        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
+        enhanced = clahe.apply(image)
+        return enhanced
+    def _deskew(self, image):
+        """Deskews the image assuming the text is horizontal."""
+        angle = determine_skew(image)
+        (h, w) = image.shape[:2]
+        center = (w // 2, h // 2)
+        # Get the rotation matrix
+        M = cv2.getRotationMatrix2D(center, angle, 1.0)
+        # Perform the actual rotation and return the image
+        deskewed = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
+        return deskewed
+    def _binarize(self, image):
+        """Converts the image to a binary image using Otsu's binarization."""
+        _, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
+        return binary
+    def preprocess_image(self, image):
+        """Preprocesses the image by enhancing contrast, removing noise, and deskewing."""
+        gray = self._grayscale(image)
+        denoised = self._remove_noise(gray)
+        enhanced = self._enhance_contrast(denoised)
+        deskewed = self._deskew(enhanced)
+        binary = self._binarize(deskewed)
+        return binary
+    def correct_spelling(self, text):
+        """Corrects the spelling of the given text."""
+        corrected_text = []
+        for word in text.split():
+            corrected_word = self.spell.correction(word)
+            if corrected_word:
+                corrected_text.append(corrected_word)
+        return ' '.join(corrected_text)
+    def overlay_annotations(self, frame, annotated_frame):
+        """Overlay annotations from the annotated frame onto the current frame."""
+        alpha = 0.4  # Transparency factor
+        cv2.addWeighted(annotated_frame, alpha, frame, 1 - alpha, 0, frame)
+        return frame

src/video_processor.py ADDED Viewed

	@@ -0,0 +1,41 @@

+import cv2
+from collections import deque
+from streamlit_webrtc import VideoProcessorBase
+from src.utilities import Utilities
+from src.st_context import with_streamlit_context
+class VideoProcessor(VideoProcessorBase):
+    def __init__(self, queues, conf_thresh, n, k):
+        self.queues = queues
+        self.conf_thresh = conf_thresh
+        self.n = n
+        self.k = k
+        self.frame_counter = 0
+        self.utilities = Utilities()
+        self.annotation_counter = 0
+    @with_streamlit_context
+    def transform(self, frame):
+        img = frame.to_ndarray(format="bgr24")
+        self.frame_counter += 1
+        if self.frame_counter % self.n == 0:
+            # Preprocess the frame
+            preprocessed_frame = self.utilities.preprocess_image(img)
+            # Add frame to queue if it is empty
+            if self.queues['frame_queue'].empty():
+                self.queues['frame_queue'].put(preprocessed_frame)
+            annotations = self.utilities.detect_annotations(preprocessed_frame, self.queues['text_queue'], self.conf_thresh)
+            if annotations:
+                self.queues['annotation_queue'].put((annotations, self.k))  # Store annotations with counter
+                self.annotation_counter = self.k  # Reset the counter
+        # Draw annotations from the queue
+        if not self.queues['annotation_queue'].empty() and self.annotation_counter > 0:
+            annotations, _ = self.queues['annotation_queue'].get()
+            img = self.utilities.draw_annotations(img, annotations)
+            self.annotation_counter -= 1
+        return img