Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,42 +3,29 @@ import re
|
|
| 3 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
| 4 |
import torch
|
| 5 |
import gradio as gr
|
| 6 |
-
import
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
import os
|
| 9 |
|
| 10 |
-
# OCR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
model_name = "PleIAs/OCRonos-Vintage"
|
| 12 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
-
|
| 14 |
-
# Load pre-trained model and tokenizer
|
| 15 |
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
|
| 16 |
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
font-size: 1.2em;
|
| 25 |
-
}
|
| 26 |
-
.inserted {
|
| 27 |
-
background-color: #90EE90;
|
| 28 |
-
}
|
| 29 |
-
</style>
|
| 30 |
-
"""
|
| 31 |
-
|
| 32 |
-
def generate_html_diff(old_text, new_text):
    """Render a word-level diff of two strings as an HTML fragment.

    Unchanged words are emitted verbatim; words present only in
    *new_text* are wrapped in <span class="inserted">. Deleted words
    (and difflib '? ' hint lines) are dropped from the output.
    """
    differ = difflib.Differ()
    pieces = []
    for entry in differ.compare(old_text.split(), new_text.split()):
        tag, word = entry[:2], entry[2:]
        if tag == '  ':
            pieces.append(word)
        elif tag == '+ ':
            pieces.append(f'<span class="inserted">{word}</span>')
    return ' '.join(pieces)
|
| 42 |
|
| 43 |
def split_text(text, max_tokens=400):
|
| 44 |
tokens = tokenizer.tokenize(text)
|
|
@@ -86,18 +73,22 @@ def process_text(user_message):
|
|
| 86 |
corrected_chunks.append(corrected_chunk)
|
| 87 |
|
| 88 |
corrected_text = ' '.join(corrected_chunks)
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
process_button.click(process_text, inputs=text_input, outputs=[text_output])
|
| 102 |
|
| 103 |
if __name__ == "__main__":
|
|
|
|
| 3 |
from transformers import GPT2LMHeadModel, GPT2Tokenizer
|
| 4 |
import torch
|
| 5 |
import gradio as gr
|
| 6 |
+
from difflib import Differ
|
| 7 |
from concurrent.futures import ThreadPoolExecutor
|
| 8 |
import os
|
| 9 |
|
| 10 |
+
description = """# 🙋🏻♂️Welcome to Tonic's On-Device📲⌚🎅🏻OCR Corrector (CPU)
|
| 11 |
+
📲⌚🎅🏻OCRonos-Vintage is a small specialized model for OCR correction of cultural heritage archives pre-trained with llm.c. OCRonos-Vintage is only 124 million parameters. It can run easily on CPU or provide correction at scale on GPUs (>10k tokens/seconds) while providing a quality of correction comparable to GPT-4 or the llama version of OCRonos for English-speaking cultural archives.
|
| 12 |
+
|
| 13 |
+
### Join us :
|
| 14 |
+
🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
# Model setup: OCRonos-Vintage is a small (124M-parameter) GPT-2-style
# model specialized for OCR correction; runs on CPU or GPU.
model_name = "PleIAs/OCRonos-Vintage"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load pre-trained model and tokenizer once at import time.
# NOTE(review): the diff inserted a stray bare-text banner line here
# ("Welcome to Tonic's ... Corrector"); it is not valid Python and would
# raise a SyntaxError at import, so it has been removed.
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
|
| 22 |
|
| 23 |
+
def diff_texts(text1, text2):
    """Compute a word-level diff suitable for gr.HighlightedText.

    Returns a list of (word, category) tuples where category is '+'
    (inserted), '-' (deleted), '?' (differ hint), or None (unchanged).
    """
    comparison = Differ().compare(text1.split(), text2.split())
    highlighted = []
    for entry in comparison:
        marker = entry[0]
        highlighted.append((entry[2:], None if marker == " " else marker))
    return highlighted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def split_text(text, max_tokens=400):
|
| 31 |
tokens = tokenizer.tokenize(text)
|
|
|
|
| 73 |
corrected_chunks.append(corrected_chunk)
|
| 74 |
|
| 75 |
corrected_text = ' '.join(corrected_chunks)
|
| 76 |
+
return diff_texts(user_message, corrected_text)
|
| 77 |
+
|
| 78 |
+
# Build the Gradio UI: a textbox for raw OCR output, a button that runs
# process_text, and a HighlightedText widget showing the word-level diff.
with gr.Blocks(theme=gr.themes.Base()) as demo:
    # Fix: the component is gr.Markdown — gr.MarkDown (wrong casing)
    # raises AttributeError as soon as the app starts.
    gr.Markdown(description)
    text_input = gr.Textbox(
        label="↘️Enter 👁️OCR'ed Text Outputs Here",
        info="""Hi there, ;fémy name à`gis tonic 45and i like to ride my vpotz""",
        lines=5,
    )
    process_button = gr.Button("Correct using 📲⌚🎅🏻OCRonos")
    text_output = gr.HighlightedText(
        label="📲⌚🎅🏻OCRonos Correction:",
        combine_adjacent=True,
        show_legend=True,
        color_map={"+": "green", "-": "red"}
    )
    process_button.click(process_text, inputs=text_input, outputs=[text_output])
|
| 93 |
|
| 94 |
if __name__ == "__main__":
|