Curative committed on
Commit 0e8551f · verified · 1 parent: e8f73fc

Update app.py

Files changed (1)
app.py +96 -35
app.py CHANGED
@@ -1,72 +1,133 @@
  import gradio as gr
- from transformers import pipeline
+ from transformers import pipeline, AutoTokenizer
+ import torch
  
- # Lazy‑load pipelines
- sentiment = classifier = ner = summarizer = None
+ # —— Lazy‑loaded pipelines & tokenizers —— #
+ summarizer = sentiment = ner = classifier = None
+ ner_tokenizer = None
+
+ def get_summarizer():
+     global summarizer
+     if summarizer is None:
+         summarizer = pipeline(
+             "summarization",
+             model="Curative/t5-summarizer-cnn",
+             framework="pt"
+         )
+     return summarizer
  
  def get_sentiment():
      global sentiment
-     if not sentiment:
-         sentiment = pipeline("sentiment-analysis",
-             model="distilbert-base-uncased-finetuned-sst-2-english")
+     if sentiment is None:
+         sentiment = pipeline(
+             "sentiment-analysis",
+             model="distilbert-base-uncased-finetuned-sst-2-english",
+             framework="pt"
+         )
      return sentiment
  
  def get_classifier():
      global classifier
-     if not classifier:
+     if classifier is None:
          classifier = pipeline(
              "zero-shot-classification",
-             model="facebook/bart-large-mnli")
+             model="facebook/bart-large-mnli",
+             framework="pt"
+         )
      return classifier
  
  def get_ner():
-     global ner
-     if not ner:
-         ner = pipeline("ner",
+     global ner, ner_tokenizer
+     if ner is None:
+         # Load the fast tokenizer explicitly for proper aggregation
+         ner_tokenizer = AutoTokenizer.from_pretrained(
+             "elastic/distilbert-base-uncased-finetuned-conll03-english",
+             use_fast=True
+         )
+         ner = pipeline(
+             "ner",
              model="elastic/distilbert-base-uncased-finetuned-conll03-english",
-             aggregation_strategy="simple")
+             tokenizer=ner_tokenizer,
+             aggregation_strategy="simple",
+             framework="pt"
+         )
      return ner
  
- def get_summarizer():
-     global summarizer
-     if not summarizer:
-         summarizer = pipeline("summarization",
-             model="Curative/t5-summarizer-cnn")
-     return summarizer
+ # —— Helper functions —— #
+ def chunk_and_summarize(text: str) -> str:
+     """Split on sentences into ≤1,000 char chunks, summarize each, then join."""
+     summarizer = get_summarizer()
+     max_chunk = 1000
+     sentences = text.split(". ")
+     chunks, current = [], ""
+     for sent in sentences:
+         # +2 accounts for the period and space
+         if len(current) + len(sent) + 2 <= max_chunk:
+             current += sent + ". "
+         else:
+             chunks.append(current.strip())
+             current = sent + ". "
+     if current:
+         chunks.append(current.strip())
+
+     summaries = []
+     for chunk in chunks:
+         part = summarizer(
+             chunk,
+             max_length=150,
+             min_length=40,
+             do_sample=False
+         )[0]["summary_text"]
+         summaries.append(part)
+     return " ".join(summaries)
+
+ def merge_entities(ents):
+     """Merge sub‑word tokens (##…) into full words."""
+     merged = []
+     for e in ents:
+         w, t = e["word"], e["entity_group"]
+         if w.startswith("##") and merged:
+             merged[-1]["word"] += w.replace("##", "")
+         else:
+             merged.append({"word": w, "type": t})
+     return merged
  
  def process(text, features):
-     result = {}
+     out = {}
      if "Summarization" in features:
-         result["summary"] = get_summarizer()(
-             text, max_length=150, min_length=40, do_sample=False
-         )[0]["summary_text"]
+         out["summary"] = chunk_and_summarize(text)
      if "Sentiment" in features:
-         sent = get_sentiment()(text)[0]
-         result["sentiment"] = {"label": sent["label"], "score": sent["score"]}
+         s = get_sentiment()(text)[0]
+         out["sentiment"] = {"label": s["label"], "score": s["score"]}
      if "Classification" in features:
-         candidate_labels = [
-             "technology", "sports", "business", "politics",
-             "health", "science", "travel", "entertainment"
+         labels = ["technology","sports","business","politics",
+                   "health","science","travel","entertainment"]
+         cls = get_classifier()(text, candidate_labels=labels)
+         # Zip & sort
+         pairs = sorted(
+             zip(cls["labels"], cls["scores"]),
+             key=lambda x: x[1],
+             reverse=True
+         )
+         out["classification"] = [
+             {"label": lbl, "score": scr} for lbl, scr in pairs
          ]
-         cls = get_classifier()(text, candidate_labels=candidate_labels)
-         # Map labels → scores
-         result["classification"] = dict(zip(cls["labels"], cls["scores"]))
      if "Entities" in features:
          ents = get_ner()(text)
-         result["entities"] = [
-             {"word": e["word"], "type": e["entity_group"]} for e in ents
-         ]
-     return result
+         out["entities"] = merge_entities(ents)
+     return out
  
+ # —— Gradio UI —— #
  with gr.Blocks() as demo:
      gr.Markdown("## 🛠️ Multi‑Feature NLP Service")
-     inp = gr.Textbox(lines=6, placeholder="Enter your text here…")
+     inp = gr.Textbox(lines=8, placeholder="Enter your text here…")
      feats = gr.CheckboxGroup(
          ["Summarization","Sentiment","Classification","Entities"],
          label="Select features to run"
      )
      btn = gr.Button("Run")
      out = gr.JSON(label="Results")
+
      btn.click(process, [inp, feats], out)
  
  demo.queue(api_open=True).launch()
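
Because the app launches with demo.queue(api_open=True), the same handler can also be called programmatically once the Space (or a local python app.py) is running. Below is a minimal client-side sketch, not part of this commit: the local URL and the "/process" endpoint name are assumptions (Gradio normally names the endpoint after the handler function), and Client.view_api() lists what the running app actually exposes.

from gradio_client import Client

# Assumed local URL; replace with the Space id, e.g. "owner/space-name", once deployed.
client = Client("http://127.0.0.1:7860/")
client.view_api()  # prints the available endpoints and their parameters

result = client.predict(
    "Hugging Face is a company based in New York City.",  # text for the Textbox input
    ["Sentiment", "Entities"],                            # features selected in the CheckboxGroup
    api_name="/process",                                  # assumed default endpoint name
)
print(result)  # should be the dict shown in the gr.JSON output, e.g. "sentiment" and "entities" keys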