File size: 1,459 Bytes
d1f836f
e20523b
d1f836f
c969a0f
1f33001
d1f836f
cde183a
f76838b
 
cb73ab4
f76838b
 
1f33001
d1f836f
 
1f33001
7e7fb74
d1f836f
 
1f33001
d1f836f
 
1f33001
d1f836f
 
 
 
 
 
 
 
 
 
 
 
1f33001
 
d1f836f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# app.py  – CPU-only summariser for Hugging Face Spaces (free tier)
from optimum.onnxruntime import ORTModelForSeq2SeqLM
import textwrap, gradio as gr
from transformers import AutoTokenizer, pipeline

# Small, openly-licensed summarisation checkpoint — needs no HF access token.
MODEL_ID = "Xenova/distilbart-cnn-6-6"

# Load the ONNX-exported weights through optimum's ORT wrapper so inference
# runs on onnxruntime (CPU) instead of a full PyTorch backend.
tok   = AutoTokenizer.from_pretrained(MODEL_ID)
model = ORTModelForSeq2SeqLM.from_pretrained(MODEL_ID)

# device=-1 pins the transformers pipeline to CPU (free-tier Spaces have no GPU).
summariser = pipeline("summarization", model=model, tokenizer=tok, device=-1)

# Rough character budget that maps onto the model's 1,024-token context window
# (~3.4 chars per token for English text).
MAX_CHUNK = 3_500

def summarize(txt: str) -> str:
    """Summarise a transcript of arbitrary length.

    Long inputs are split into ~MAX_CHUNK-character pieces (on whitespace),
    each piece is summarised independently, and the partial summaries are
    then condensed into one coherent overall summary.

    Parameters
    ----------
    txt : str
        Raw transcript text. Blank/whitespace-only input yields "".

    Returns
    -------
    str
        The summary text (empty string for blank input).
    """
    # Fast path: avoid invoking the model pipeline on empty input.
    if not txt.strip():
        return ""

    chunks = textwrap.wrap(txt, MAX_CHUNK, break_long_words=False)
    partials = [
        summariser(c, max_length=160, min_length=30, do_sample=False)[0]["summary_text"]
        for c in chunks
    ]
    first_pass = " ".join(partials)

    # If we had to chunk, do a second pass to get a coherent overall summary.
    if len(chunks) > 1:
        # Very long transcripts can produce a joined first pass that still
        # exceeds the model's token window — reduce recursively until it fits.
        if len(first_pass) > MAX_CHUNK:
            return summarize(first_pass)
        first_pass = summariser(
            first_pass, max_length=180, min_length=40, do_sample=False
        )[0]["summary_text"]
    return first_pass

# Minimal Gradio UI: one large textbox in, plain-text summary out.
demo = gr.Interface(
    fn=summarize,
    inputs=gr.Textbox(lines=20, label="Transcript"),
    outputs="text",
    title="Free Transcript Summariser – DistilBART-CNN",
)

# Start the server only when run as a script (Spaces executes this file
# directly; importing the module elsewhere must not launch anything).
if __name__ == "__main__":
    demo.launch()