import gradio as gr
from gradio_rich_textbox import RichTextbox
from PIL import Image
# pdf2image (with poppler) is assumed available: surya's run_ocr consumes PIL
# images, so PDF pages must be rendered first (see OCRProcessor.process_pdf).
from pdf2image import convert_from_path
from surya.ocr import run_ocr
from surya.model.detection.segformer import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor
from gradio_client import Client
from dotenv import load_dotenv
import cohere
import os
import re
import pandas as pd


title = "# Welcome to AyaTonic"
description = "Learn a New Language With Aya"

load_dotenv()
COHERE_API_KEY = os.getenv('CO_API_KEY')
SEAMLESSM4T = os.getenv('SEAMLESSM4T')
# lang_list.csv must provide a "name" column; it populates both language
# dropdowns in the UI below.
df = pd.read_csv("lang_list.csv")

producetext = "\n\nProduce a complete expositional blog post in {target_language} based on the above:"
formatinputstring = "\n\nThe above text is a learning aid. You must use rich text format to rewrite it, adding: 1. red color tags for nouns, 2. blue color tags for verbs, 3. green color tags for adjectives and adverbs:"

# Regex patterns for the color tags that formatinputstring asks the model to emit.
patterns = {
    "red": r'<span style="color: red;">(.*?)</span>',
    "blue": r'<span style="color: blue;">(.*?)</span>',
    "green": r'<span style="color: green;">(.*?)</span>',
}


class TaggedPhraseExtractor:
    def __init__(self, text=''):
        self.text = text
        self.patterns = {}

    def set_text(self, text):
        """Set the text to search within."""
        self.text = text

    def add_pattern(self, color, pattern):
        """Add a new color and its associated pattern."""
        self.patterns[color] = pattern

    def extract_phrases(self):
        """Extract phrases for all colors and patterns added."""
        return {color: re.findall(pattern, self.text) for color, pattern in self.patterns.items()}

    def print_phrases(self):
        """Extract phrases and print them, grouped by color."""
        matches = self.extract_phrases()
        for color, phrases in matches.items():
            print(f"Phrases with color {color}:")
            for phrase in phrases:
                print(f"- {phrase}")
            print()
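

# Convenience wrapper (sketch): wires the module-level color patterns into a
# TaggedPhraseExtractor in one call. Illustrative glue over the class above,
# not part of the original pipeline.
def extract_color_tagged_phrases(text):
    extractor = TaggedPhraseExtractor(text)
    for color, pattern in patterns.items():
        extractor.add_pattern(color, pattern)
    return extractor.extract_phrases()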


co = cohere.Client(COHERE_API_KEY)
# A single shared client for the SeamlessM4T space, reused by both audio helpers.
audio_client = Client(SEAMLESSM4T)


def process_audio_to_text(audio_path, inputlanguage="English"):
    """
    Convert audio input to text using the SeamlessM4T Gradio client.
    """
    result = audio_client.predict(
        audio_path,
        inputlanguage,
        inputlanguage,
        api_name="/s2tt"
    )
    print("Audio Result: ", result)
    return result[0]


def process_text_to_audio(text, translatefrom, translateto):
    """
    Convert text input to audio using the SeamlessM4T Gradio client.
    """
    result = audio_client.predict(
        text,
        translatefrom,
        translateto,
        api_name="/t2st"
    )
    return result[0]


class OCRProcessor:
    def __init__(self, langs=["en"]):
        self.langs = langs
        self.det_processor, self.det_model = load_det_processor(), load_det_model()
        self.rec_model, self.rec_processor = load_rec_model(), load_rec_processor()

    def process_image(self, image):
        """
        Run OCR on a single PIL image and return its OCR result.
        """
        predictions = run_ocr([image], [self.langs], self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions[0]

    def process_pdf(self, pdf_path):
        """
        Run OCR on a PDF and return one OCR result per page.

        run_ocr expects PIL images rather than a file path, so the pages are
        rendered to images first.
        """
        pages = convert_from_path(pdf_path)
        predictions = run_ocr(pages, [self.langs] * len(pages), self.det_model, self.det_processor, self.rec_model, self.rec_processor)
        return predictions
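

# Helper (sketch): flatten a surya OCR result into plain text. Assumes surya's
# OCRResult schema, where recognized lines live in .text_lines and each line
# carries its string in .text.
def ocr_result_to_text(ocr_result):
    return " ".join(line.text for line in ocr_result.text_lines)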


def process_input(image=None, file=None, audio=None, text="", translatefrom="English", translateto="English"):
    ocr_processor = OCRProcessor()
    final_text = text
    if image is not None:
        ocr_prediction = ocr_processor.process_image(image)
        final_text += " " + ocr_result_to_text(ocr_prediction)
    if file is not None:
        if file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            pil_image = Image.open(file)
            ocr_prediction = ocr_processor.process_image(pil_image)
            final_text += " " + ocr_result_to_text(ocr_prediction)
        elif file.name.lower().endswith('.pdf'):
            for page_prediction in ocr_processor.process_pdf(file.name):
                final_text += " " + ocr_result_to_text(page_prediction)
        else:
            final_text += "\nUnsupported file type."
        print("OCR Text: ", final_text)
    if audio is not None:
        audio_text = process_audio_to_text(audio, inputlanguage=translatefrom)
        final_text += "\n" + audio_text

    final_text_with_producetext = final_text + producetext.format(target_language=translateto)

    response = co.generate(
        model='c4ai-aya',
        prompt=final_text_with_producetext,
        max_tokens=1024,
        temperature=0.5
    )
    generated_text = response.generations[0].text
    print("Generated Text: ", generated_text)

    generated_text_with_format = generated_text + "\n" + formatinputstring
    response = co.generate(
        model='command-nightly',
        prompt=generated_text_with_format,
        max_tokens=4000,
        temperature=0.5
    )
    processed_text = response.generations[0].text

    # The blog post was generated in the target language, so it is spoken
    # from and to that same language.
    audio_output = process_text_to_audio(processed_text, translateto, translateto)

    return processed_text, audio_output
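

# Optional self-test (sketch): exercises the text-only path end to end.
# AYATONIC_SELFTEST is a hypothetical opt-in flag; the Cohere and SeamlessM4T
# calls above hit remote, billed services, so this stays off by default.
if os.getenv("AYATONIC_SELFTEST"):
    demo_text, demo_audio = process_input(text="Bonjour tout le monde",
                                          translatefrom="French", translateto="English")
    print(demo_text)
    print(extract_color_tagged_phrases(demo_text))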


def main():
    with gr.Blocks() as demo:
        gr.Markdown(title)
        gr.Markdown(description)

        with gr.Row():
            input_language = gr.Dropdown(choices=df["name"].to_list(), label="Your Native Language")
            target_language = gr.Dropdown(choices=df["name"].to_list(), label="Language To Learn")

        with gr.Accordion("Talk To 🌟AyaTonic"):
            with gr.Tab("🤙🏻Audio & Text"):
                audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Mic Input")
                text_input = gr.Textbox(lines=2, label="Text Input")
            with gr.Tab("📸Image & File"):
                image_input = gr.Image(type="pil", label="Camera Input")
                file_input = gr.File(label="File Upload")

        process_button = gr.Button("🌟AyaTonic")

        processed_text_output = RichTextbox(label="Processed Text")
        audio_output = gr.Audio(label="Audio Output")

        process_button.click(
            fn=process_input,
            inputs=[image_input, file_input, audio_input, text_input, input_language, target_language],
            outputs=[processed_text_output, audio_output]
        )

    # Build, then actually serve the app; without launch() nothing is started.
    demo.launch()


if __name__ == "__main__":
    main()