File size: 1,727 Bytes
bff554c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Configure the page title, page icon and wide layout for the app.
# Must be the first Streamlit command, so keep this call above every other
# `st.*` call in the script (including the cached model loading further down).
st.set_page_config(page_title="Khmer Text Summarization", page_icon="πŸ“", layout="wide")

import torch

# Identifier of the pretrained summarization checkpoint; passed to both
# AutoTokenizer.from_pretrained and AutoModelForSeq2SeqLM.from_pretrained.
MODEL_ID = "songhieng/khmer-mt5-summarization-duplicated"

@st.cache_resource
def load_model():
    """Load the tokenizer and seq2seq model identified by ``MODEL_ID``.

    The function is decorated with ``st.cache_resource`` so Streamlit can
    reuse the loaded objects instead of reloading them on every script rerun.

    Returns:
        A ``(tokenizer, model)`` tuple: the ``AutoTokenizer`` and the
        ``AutoModelForSeq2SeqLM`` instances loaded from ``MODEL_ID``.
    """
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(MODEL_ID),
        AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID),
    )

# Obtain the tokenizer and model from load_model() (defined above).
tokenizer, model = load_model()

# NOTE(review): several string literals below (emoji and the Khmer
# placeholder) look mis-encoded in this copy of the file -- verify them
# against the original UTF-8 source before shipping.
st.title("πŸ“ Khmer Text Summarization")
st.markdown("Input Khmer text and get a concise summary powered by your fine-tuned mT5 model.")

# Sidebar controls: these values are forwarded to model.generate() below.
st.sidebar.header("Settings")
max_length = st.sidebar.slider("Max summary length", 50, 300, 150, step=10)
min_length = st.sidebar.slider("Min summary length", 10, 100, 30, step=5)
num_beams = st.sidebar.slider("Number of beams", 1, 10, 4)

# The two slider ranges overlap (min goes up to 100, max goes down to 50), so
# the user can select min_length > max_length. Those constraints cannot both
# be satisfied by generate(), so clamp the minimum and tell the user.
if min_length > max_length:
    st.sidebar.warning(
        f"Min summary length ({min_length}) is greater than max summary length "
        f"({max_length}); using {max_length} as the minimum."
    )
    min_length = max_length

# Input text
text = st.text_area("✏️ Paste Khmer text below:", height=300, placeholder="αžŸαžΌαž˜αžœαžΆαž™αž’αžαŸ’αžαž”αž‘αžαŸ’αž˜αŸ‚αžšαž“αŸ…αž‘αžΈαž“αŸαŸ‡β€¦")

if st.button("πŸ” Summarize"):
    if not text.strip():
        # Empty or whitespace-only input: nothing to summarize.
        st.warning("⚠️ Please enter some text.")
    else:
        with st.spinner("Summarizing..."):
            # NOTE(review): truncation is enabled without an explicit
            # max_length, so the limit comes from the tokenizer's own
            # configuration -- confirm it is bounded for this checkpoint.
            inputs = tokenizer(text, return_tensors="pt", truncation=True)
            summary_ids = model.generate(
                **inputs,
                max_length=max_length,
                min_length=min_length,
                num_beams=num_beams,
                length_penalty=2.0,
                early_stopping=True,
            )
            # Only the first returned sequence is decoded (single input text).
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        st.subheader("πŸ“„ Summary")
        st.success(summary)