File size: 4,138 Bytes
e56279f
0e8551f
 
e56279f
0e8551f
 
 
 
 
 
 
 
 
 
 
 
 
e56279f
fc959be
 
0e8551f
 
 
 
 
 
fc959be
 
 
 
0e8551f
08f21ce
 
0e8551f
 
 
fc959be
 
 
0e8551f
 
 
 
 
 
 
 
 
950842b
0e8551f
 
 
 
fc959be
 
0e8551f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
950842b
fc959be
0e8551f
fc959be
0e8551f
fc959be
0e8551f
 
fc959be
0e8551f
 
 
 
 
 
 
 
 
 
 
08f21ce
fc959be
 
0e8551f
 
fc959be
0e8551f
fc959be
950842b
0e8551f
950842b
 
 
fc959be
950842b
 
0e8551f
950842b
fc959be
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
from transformers import pipeline, AutoTokenizer
import torch

# —— Pipeline / tokenizer slots, filled on demand —— #
# Every slot starts out as None; the matching get_*() accessor below
# replaces it with a real object the first time it is called.
summarizer = None
sentiment = None
ner = None
classifier = None
ner_tokenizer = None

def get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global summarizer
    if summarizer is not None:
        return summarizer
    # First call: build the pipeline and cache it in the module-level slot.
    model_id = "Curative/t5-summarizer-cnn"
    summarizer = pipeline("summarization", model=model_id, framework="pt")
    return summarizer

def get_sentiment():
    """Return the shared sentiment-analysis pipeline, creating it on first use."""
    global sentiment
    if sentiment is not None:
        return sentiment
    # First call: build the pipeline and cache it in the module-level slot.
    model_id = "distilbert-base-uncased-finetuned-sst-2-english"
    sentiment = pipeline("sentiment-analysis", model=model_id, framework="pt")
    return sentiment

def get_classifier():
    """Return the shared zero-shot classification pipeline, creating it on first use."""
    global classifier
    if classifier is not None:
        return classifier
    # First call: build the pipeline and cache it in the module-level slot.
    model_id = "facebook/bart-large-mnli"
    classifier = pipeline(
        "zero-shot-classification", model=model_id, framework="pt"
    )
    return classifier

def get_ner():
    """Return the shared NER pipeline, creating it (and its tokenizer) on first use."""
    global ner, ner_tokenizer
    if ner is not None:
        return ner
    checkpoint = "elastic/distilbert-base-uncased-finetuned-conll03-english"
    # The fast tokenizer is requested explicitly and handed to the pipeline
    # so that entity aggregation works properly.
    ner_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
    ner = pipeline(
        "ner",
        model=checkpoint,
        tokenizer=ner_tokenizer,
        aggregation_strategy="simple",
        framework="pt",
    )
    return ner

# —— Helper functions —— #
def _split_into_chunks(text, max_chunk=1000):
    """Greedily pack the sentences of *text* into chunks of at most *max_chunk* chars.

    Sentences are delimited by ". ". Empty pieces are dropped, so an empty
    chunk is never produced. A single sentence longer than *max_chunk* is
    emitted as its own (oversized) chunk rather than being cut mid-sentence.
    """
    pieces = text.split(". ")
    last = len(pieces) - 1
    chunks, current = [], ""
    for idx, piece in enumerate(pieces):
        sent = piece.strip()
        if not sent:
            continue
        # split() consumed the period after every piece except the last one;
        # the last piece keeps whatever punctuation the input ended with, so
        # only add a period there when it is actually missing.
        if idx < last or not sent.endswith((".", "!", "?")):
            sent += "."
        candidate = f"{current} {sent}" if current else sent
        if len(candidate) <= max_chunk:
            current = candidate
        else:
            # Flush what we have (if anything) and start a new chunk.
            if current:
                chunks.append(current)
            current = sent
    if current:
        chunks.append(current)
    return chunks


def chunk_and_summarize(text: str) -> str:
    """Summarize *text* chunk by chunk and join the partial summaries.

    The text is split on sentence boundaries into chunks of at most 1,000
    characters, each chunk is summarized separately, and the summaries are
    joined with single spaces.

    Args:
        text: The text to summarize.

    Returns:
        The concatenated summaries, or "" when *text* is empty, blank, or
        contains no sentence content (the model is not loaded in that case).
    """
    if not text or not text.strip():
        return ""
    chunks = _split_into_chunks(text, max_chunk=1000)
    if not chunks:
        return ""

    # Only fetch (and possibly load) the model once there is work to do.
    summarize = get_summarizer()
    return " ".join(
        summarize(
            chunk,
            max_length=150,
            min_length=40,
            do_sample=False,
        )[0]["summary_text"]
        for chunk in chunks
    )

def merge_entities(ents):
    """Merge sub-word fragments (``##…``) into the preceding entity.

    Args:
        ents: Iterable of dicts that each carry at least the keys "word"
            and "entity_group"; any other keys are ignored.

    Returns:
        A new list of ``{"word": ..., "type": ...}`` dicts. A fragment whose
        word starts with "##" is appended to the previous entry's word and
        keeps that entry's type. Only the leading "##" marker is removed.
        A fragment with no previous entry becomes its own entry, with the
        marker stripped so it does not leak into the output.
    """
    marker = "##"
    merged = []
    for ent in ents:
        word, group = ent["word"], ent["entity_group"]
        is_fragment = word.startswith(marker)
        if is_fragment:
            # Strip only the leading continuation marker.
            word = word[len(marker):]
        if is_fragment and merged:
            merged[-1]["word"] += word
        else:
            merged.append({"word": word, "type": group})
    return merged

def process(text, features):
    """Run the selected NLP features on *text* and collect their results.

    Args:
        text: The input text.
        features: Collection of selected feature names. Recognised values
            are "Summarization", "Sentiment", "Classification" and
            "Entities". ``None`` is treated as an empty selection.

    Returns:
        A dict with one entry per feature that was run:
        "summary" (str), "sentiment" ({"label", "score"}),
        "classification" (list of {"label", "score"} sorted by descending
        score) and "entities" (output of ``merge_entities``). The dict is
        empty when *text* is blank or no feature is selected, in which case
        no model is loaded or called.
    """
    out = {}
    selected = features or ()
    if not selected or not text or not text.strip():
        return out

    if "Summarization" in selected:
        out["summary"] = chunk_and_summarize(text)
    if "Sentiment" in selected:
        # Truncate so inputs longer than the model's maximum sequence length
        # are cut down instead of failing.
        s = get_sentiment()(text, truncation=True)[0]
        out["sentiment"] = {"label": s["label"], "score": s["score"]}
    if "Classification" in selected:
        labels = ["technology", "sports", "business", "politics",
                  "health", "science", "travel", "entertainment"]
        cls = get_classifier()(text, candidate_labels=labels)
        # Pair each label with its score, highest score first.
        pairs = sorted(
            zip(cls["labels"], cls["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        out["classification"] = [
            {"label": lbl, "score": scr} for lbl, scr in pairs
        ]
    if "Entities" in selected:
        out["entities"] = merge_entities(get_ner()(text))
    return out

# —— Gradio UI —— #
with gr.Blocks() as demo:
    # Page title.
    gr.Markdown(value="## 🛠️ Multi‑Feature NLP Service")

    # Inputs: free text plus the set of features to run on it.
    inp = gr.Textbox(placeholder="Enter your text here…", lines=8)
    feats = gr.CheckboxGroup(
        choices=["Summarization", "Sentiment", "Classification", "Entities"],
        label="Select features to run",
    )

    # Trigger and output area.
    btn = gr.Button(value="Run")
    out = gr.JSON(label="Results")

    # Clicking the button calls process(text, features) and renders the
    # returned dict in the JSON component.
    btn.click(fn=process, inputs=[inp, feats], outputs=out)

# Enable the request queue with the API left open, then start the app.
demo.queue(api_open=True)
demo.launch()