Spaces:

Curative
/

t5-summarizer-gradio

Sleeping

File size: 4,138 Bytes

e56279f
0e8551f
 
e56279f
0e8551f
 
 
 
 
 
 
 
 
 
 
 
 
e56279f
fc959be
 
0e8551f
 
 
 
 
 
fc959be
 
 
 
0e8551f
08f21ce
 
0e8551f
 
 
fc959be
 
 
0e8551f
 
 
 
 
 
 
 
 
950842b
0e8551f
 
 
 
fc959be
 
0e8551f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950842b
fc959be
0e8551f
fc959be
0e8551f
fc959be
0e8551f
 
fc959be
0e8551f
 
 
 
 
 
 
 
 
 
 
08f21ce
fc959be
 
0e8551f
 
fc959be
0e8551f
fc959be
950842b
0e8551f
950842b
 
 
fc959be
950842b
 
0e8551f
950842b
fc959be

import gradio as gr
from transformers import pipeline, AutoTokenizer
import torch

# —— Lazy‑loaded pipelines & tokenizers —— #
summarizer = sentiment = ner = classifier = None
ner_tokenizer = None

def get_summarizer():
    global summarizer
    if summarizer is None:
        summarizer = pipeline(
            "summarization",
            model="Curative/t5-summarizer-cnn",
            framework="pt"
        )
    return summarizer

def get_sentiment():
    global sentiment
    if sentiment is None:
        sentiment = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            framework="pt"
        )
    return sentiment

def get_classifier():
    global classifier
    if classifier is None:
        classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            framework="pt"
        )
    return classifier

def get_ner():
    global ner, ner_tokenizer
    if ner is None:
        # Load Fast tokenizer explicitly for proper aggregation
        ner_tokenizer = AutoTokenizer.from_pretrained(
            "elastic/distilbert-base-uncased-finetuned-conll03-english",
            use_fast=True
        )
        ner = pipeline(
            "ner",
            model="elastic/distilbert-base-uncased-finetuned-conll03-english",
            tokenizer=ner_tokenizer,
            aggregation_strategy="simple",
            framework="pt"
        )
    return ner

# —— Helper functions —— #
def chunk_and_summarize(text: str) -> str:
    """Split on sentences into ≤1,000 char chunks, summarize each, then join."""
    summarizer = get_summarizer()
    max_chunk = 1000
    sentences = text.split(". ")
    chunks, current = [], ""
    for sent in sentences:
        # +2 accounts for the period and space
        if len(current) + len(sent) + 2 <= max_chunk:
            current += sent + ". "
        else:
            chunks.append(current.strip())
            current = sent + ". "
    if current:
        chunks.append(current.strip())

    summaries = []
    for chunk in chunks:
        part = summarizer(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False
        )[0]["summary_text"]
        summaries.append(part)
    return " ".join(summaries)

def merge_entities(ents):
    """Merge sub‑word tokens (##…) into full words."""
    merged = []
    for e in ents:
        w, t = e["word"], e["entity_group"]
        if w.startswith("##") and merged:
            merged[-1]["word"] += w.replace("##", "")
        else:
            merged.append({"word": w, "type": t})
    return merged

def process(text, features):
    out = {}
    if "Summarization" in features:
        out["summary"] = chunk_and_summarize(text)  # :contentReference[oaicite:7]{index=7}
    if "Sentiment" in features:
        s = get_sentiment()(text)[0]
        out["sentiment"] = {"label": s["label"], "score": s["score"]}
    if "Classification" in features:
        labels = ["technology","sports","business","politics",
                  "health","science","travel","entertainment"]
        cls = get_classifier()(text, candidate_labels=labels)
        # Zip & sort
        pairs = sorted(
            zip(cls["labels"], cls["scores"]),
            key=lambda x: x[1],
            reverse=True
        )
        out["classification"] = [
            {"label": lbl, "score": scr} for lbl, scr in pairs
        ]
    if "Entities" in features:
        ents = get_ner()(text)
        out["entities"] = merge_entities(ents)        # :contentReference[oaicite:8]{index=8}
    return out

# —— Gradio UI —— #
with gr.Blocks() as demo:
    gr.Markdown("## 🛠️ Multi‑Feature NLP Service")
    inp = gr.Textbox(lines=8, placeholder="Enter your text here…")
    feats = gr.CheckboxGroup(
        ["Summarization","Sentiment","Classification","Entities"],
        label="Select features to run"
    )
    btn = gr.Button("Run")
    out = gr.JSON(label="Results")

    btn.click(process, [inp, feats], out)

demo.queue(api_open=True).launch()