import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Must be the first Streamlit command
st.set_page_config(page_title="Khmer Text Summarization", page_icon="📝", layout="wide")

MODEL_ID = "songhieng/khmer-mt5-summarization-duplicated"
@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
    return tokenizer, model
tokenizer, model = load_model()
st.title("📝 Khmer Text Summarization")
st.markdown("Input Khmer text and get a concise summary powered by your fine-tuned mT5 model.")
# Sidebar controls
st.sidebar.header("Settings")
max_length = st.sidebar.slider("Max summary length", 50, 300, 150, step=10)
min_length = st.sidebar.slider("Min summary length", 10, 100, 30, step=5)
num_beams = st.sidebar.slider("Number of beams", 1, 10, 4)
# Input text
text = st.text_area("✍️ Paste Khmer text below:", height=300, placeholder="Paste Khmer text here…")
if st.button("📝 Summarize"):
    if not text.strip():
        st.warning("⚠️ Please enter some text.")
    else:
        with st.spinner("Summarizing..."):
            # Tokenize the input, truncating it to the model's maximum input length
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            # Generate the summary with beam search, bounded by the sidebar settings
            summary_ids = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True
            )
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        st.subheader("📝 Summary")
        st.success(summary)
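
# A minimal sketch of how to run this app locally (the file name "app.py" is an
# assumption, not specified by the original file):
#   pip install streamlit torch transformers sentencepiece
#   streamlit run app.py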