# Page-scrape residue (not program code): "Spaces: Sleeping Sleeping"
import streamlit as st
from doctr.io import DocumentFile
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from PIL import Image
import pytesseract

import utils

# Unicode-compatible font registered with ReportLab under the name below.
fontname = "Ubuntu"
fontpath = "./Ubuntu-Regular.ttf"

# Recognition architecture/checkpoint name handed to utils.get_ocr_predictor.
reco_arch = "kz_latest.pt"

# Registration happens at import time, so the font file must be reachable
# from the working directory when the module is loaded.
pdfmetrics.registerFont(TTFont(fontname, fontpath))

# When true, image uploads are additionally run through pytesseract.
use_pytesseract = True
def main():
    """Build the Streamlit interface for OCR of Kazakh-language documents.

    The user uploads a PDF or an image. The selected page is shown in the
    left column and the merged recognised text in the right column. When the
    module-level ``use_pytesseract`` flag is true, image uploads are also
    passed through pytesseract and the result is shown in an expander; for
    any other upload a warning is displayed instead.

    Returns:
        None. All output is rendered through Streamlit.
    """
    # Wide layout; kept as the very first Streamlit call of the script.
    st.set_page_config(layout="wide")

    # Hide Streamlit's default menu, footer and header.
    hide_st_style = (
        "\n"
        "<style>\n"
        "#MainMenu {visibility: hidden;}\n"
        "footer {visibility: hidden;}\n"
        "header {visibility: hidden;}\n"
        "</style>\n"
    )
    st.markdown(hide_st_style, unsafe_allow_html=True)

    # Page title and the upload widget at the top of the page.
    st.title("Қазақша жазылған құжаттардың OCR")
    uploaded_file = st.file_uploader(
        "Файлдарды жүктеңіз", type=["pdf", "png", "jpeg", "jpg"]
    )

    # Blank line used as a vertical spacer.
    st.write("\n")

    # Two equal columns: source page on the left, merged text on the right.
    source_col, text_col = st.columns((1, 1))
    source_col.subheader("Бастапқы бет")
    text_col.subheader("Мәтіннің біріктірілген нұсқасы")

    # Nothing more to render until a file has been uploaded.
    if uploaded_file is None:
        return

    file_name = uploaded_file.name.lower()

    # getvalue() returns the whole buffer irrespective of the current stream
    # position, so the upload can be consumed again later (by PIL) and on
    # script reruns without coming back empty.
    file_bytes = uploaded_file.getvalue()
    if file_name.endswith(".pdf"):
        doc = DocumentFile.from_pdf(file_bytes)
    else:
        doc = DocumentFile.from_images(file_bytes)

    # Pages are presented 1-based to the user and converted back to 0-based.
    page_number = st.selectbox("Бетті таңдау", list(range(1, len(doc) + 1)))
    page = doc[page_number - 1]
    source_col.image(page)

    with st.spinner("Модельді жүктеу..."):
        predictor = utils.get_ocr_predictor(reco_arch=reco_arch)

    # Run the page through the model and merge the result into plain text.
    with st.spinner("Талдау..."):
        out = predictor([page])
        page_export = out.pages[0].export()
        coordinates, _, _ = utils.page_to_coordinates(page_export)
        final_text = utils.ocr_to_txt(coordinates)

    # NOTE: a disabled sketch that drew labelled boxes over the page
    # (utils.draw_boxes_with_labels) used to sit here as a no-op string
    # literal; it was dead code and has been removed.
    text_col.text_area("Мәтіннің біріктірілген нұсқасы:", final_text, height=500)

    # Optional second opinion from pytesseract (controlled by a module flag).
    if not use_pytesseract:
        return

    if file_name.endswith((".png", ".jpg", ".jpeg")):
        image = Image.open(uploaded_file)
        ocr_text = pytesseract.image_to_string(image, lang="kaz+eng+rus")
        # Collapsible block holding the pytesseract output.
        with st.expander("OCR нәтижесі (pytesseract)"):
            st.text_area("Тексеру нәтижесі:", ocr_text, height=300)
    else:
        st.warning("OCR тек суреттер үшін қол жетімді.")
# Script entry point: build the UI when the file is executed directly.
if __name__ == "__main__":
    main()