File size: 2,365 Bytes
93d3903
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5dd6b29
 
 
 
93d3903
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import streamlit as st
from transformers import AutoTokenizer, T5ForConditionalGeneration
import post_ocr


# Load model
@st.cache_resource
def load_model():
    """Load the pretrained T5 OCR-correction model.

    Wrapped in ``st.cache_resource`` so the returned model object is handed
    to Streamlit's resource cache instead of being rebuilt by every call.
    """
    checkpoint = 'viklofg/swedish-ocr-correction'
    return T5ForConditionalGeneration.from_pretrained(checkpoint)


model = load_model()


# Load tokenizer
@st.cache_resource
def load_tokenizer():
    """Load the ``google/byt5-small`` tokenizer.

    Wrapped in ``st.cache_resource`` so the returned tokenizer object is
    handed to Streamlit's resource cache instead of being rebuilt by every
    call.
    """
    checkpoint = 'google/byt5-small'
    return AutoTokenizer.from_pretrained(checkpoint)


tokenizer = load_tokenizer()


# Set model and tokenizer
# (hands the cached objects loaded above to the post_ocr module; presumably
# post_ocr.process relies on them -- confirm against post_ocr)
post_ocr.set_model(model, tokenizer)


# Title
st.title(':memo: Swedish OCR correction')
# Input and output areas: one tab for typed text, one for an uploaded file.
# The tab objects are used further down as context managers.
tab1, tab2 = st.tabs(["Text input", "From file"])


def clean_inputs():
    """Reset the remembered input of both tabs to ``None``.

    Replaces ``st.session_state.inputs`` with a fresh dict keyed by tab id.
    """
    st.session_state.inputs = dict.fromkeys(('tab1', 'tab2'))


# Create the input cache when the session state has no 'inputs' entry yet
if 'inputs' not in st.session_state:
    clean_inputs()


def clean_outputs():
    """Reset the remembered output of both tabs to ``None``.

    Replaces ``st.session_state.outputs`` with a fresh dict keyed by tab id.
    """
    st.session_state.outputs = {tab_id: None for tab_id in ('tab1', 'tab2')}


# Create the output cache when the session state has no 'outputs' entry yet
if 'outputs' not in st.session_state:
    clean_outputs()


# Sidebar (settings and stuff)
with st.sidebar:
    st.header('Settings')
    # Changing the value runs clean_inputs, which resets the remembered
    # inputs so that handle_input sees the current text as new again.
    n_candidates = st.number_input(
        'Overlap',
        min_value=1,
        max_value=7,
        value=1,
        step=2,
        help='A higher value may lead to better quality, but takes longer time',
        on_change=clean_inputs,
    )

    st.header('Output')
    # Read by handle_input to choose between the plain output and a diff
    show_changes = st.toggle('Show changes')


def handle_input(input_: str, id_: str) -> None:
    """Render the output box for one tab, generating a correction if needed.

    The last processed input and its output are kept per tab in
    ``st.session_state.inputs`` / ``st.session_state.outputs`` so that a
    rerun with unchanged text does not call the model again.

    Args:
        input_: Text currently present in the tab's text area.
        id_: Key of the tab in the session-state dicts ('tab1' or 'tab2').
    """
    with st.container(border=True):
        st.caption('Output')

        if not input_:
            # Nothing to correct: forget the cached pair so the result of an
            # earlier, since-cleared input is not displayed as if current.
            st.session_state.inputs[id_] = None
            st.session_state.outputs[id_] = None
        elif st.session_state.inputs[id_] != input_:
            # Only update the output if the input has been updated
            with st.spinner('Generating...'):
                output = post_ocr.process(input_, n_candidates)
            # Record the pair only after generation succeeded. Storing the
            # input first would make a failed run look already processed, and
            # later reruns would show the previous input's output.
            st.session_state.outputs[id_] = output
            st.session_state.inputs[id_] = input_

        # Display output
        output = st.session_state.outputs[id_]
        if output is not None:
            st.write(post_ocr.diff(input_, output) if show_changes else output)


# Manual entry tab
with tab1:
    # The label is collapsed; the placeholder serves as the prompt
    typed_input = st.text_area(
        'Input OCR',
        placeholder='Enter OCR generated text',
        label_visibility='collapsed',
    )
    handle_input(typed_input, 'tab1')


# File upload tab
with tab2:
    uploaded_file = st.file_uploader('Choose a file', type='.txt')

    # Display file content
    if uploaded_file is not None:
        try:
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading BOM,
            # which would otherwise end up in the text sent to the model.
            file_content = uploaded_file.getvalue().decode('utf-8-sig')
        except UnicodeDecodeError:
            # Report the problem in the page instead of letting the
            # exception abort the script run.
            st.error('Could not read the file: it is not valid UTF-8 text.')
        else:
            # The content is shown in an editable area; whatever the area
            # returns is what gets processed.
            text = st.text_area('File content', value=file_content, height=300)
            handle_input(text, 'tab2')