import gradio as gr
import spacy
import spacy_transformers  # noqa: F401 -- registers the transformer pipeline components for spacy.load
from huggingface_hub import snapshot_download
import os
from collections import Counter

# Download the model from Hugging Face Hub and load it.
model_repo = "hksung/ASC_tagger_v2"
model_path = snapshot_download(model_repo)  # download the public model repo; returns the local cache path
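# The snapshot is expected to contain the trained spaCy pipeline in a 'model-best' subdirectory.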
nlp = spacy.load(os.path.join(model_path, 'model-best'))

# Add a sentencizer if no existing component sets sentence boundaries (doc.sents is used below).
if 'parser' not in nlp.pipe_names and 'senter' not in nlp.pipe_names:
    nlp.add_pipe('sentencizer')

def get_highlighted_text(doc):
    """
    Wraps each detected ASC in a <span> tag that carries the entity label in its
    data-entity attribute. The final HTML output is prepended with a CSS block that
    applies the highlighting styles, including a dark-mode adjustment.
    """
    highlighted_sentences = []
    for sent in doc.sents:
        text = sent.text
        # Find all entities completely within this sentence.
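        # (Entities that span a sentence boundary are not highlighted.)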
        ents_in_sent = [ent for ent in doc.ents if ent.start_char >= sent.start_char and ent.end_char <= sent.end_char]
        if ents_in_sent:
            # Process entities in reverse order to preserve character offsets.
            ents_in_sent = sorted(ents_in_sent, key=lambda x: x.start_char, reverse=True)
            for ent in ents_in_sent:
                ent_start = ent.start_char - sent.start_char
                ent_end = ent.end_char - sent.start_char
                # Wrap the entity text with a <span> that carries the class and the data-entity attribute.
                text = (
                    text[:ent_start]
                    + f'<span class="entity" data-entity="{ent.label_}">'
                    + text[ent_start:ent_end]
                    + '</span>'
                    + text[ent_end:]
                )
        highlighted_sentences.append(text)
    result = "<br><br>".join(highlighted_sentences)
    
    style = """
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f4f4f4;
            color: #000;
        }
        .container {
            max-width: 1000px;
            margin: 0 auto;
            padding: 20px;
            background-color: white;
            border-radius: 8px;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            line-height: 2.2em;  /* Increased line spacing */
        }
        .entity {
            display: inline-block;
            border: none;
            border-radius: 2px;
            padding: 2px 5px;
            margin: 0 4px;
            position: relative;
            white-space: nowrap;
            line-height: 1.2;
            font-size: 0.8em;
            /* Ensure default text color */
            color: inherit;
        }
        /* Highlight background colors for entity types */
        .entity[data-entity="ATTR"] { background-color: #dbb6ab; }
        .entity[data-entity="INTRAN_S"] { background-color: #e7957f; }
        .entity[data-entity="INTRAN_MOT"] { background-color: #ebab22; }
        .entity[data-entity="INTRAN_RES"] { background-color: #f095cc; }
        .entity[data-entity="CAUS_MOT"] { background-color: #85a831; }
        .entity[data-entity="TRAN_S"] { background-color: #a0d4f7; }
        .entity[data-entity="TRAN_RES"] { background-color: #c7aefa; }
        .entity[data-entity="DITRAN"] { background-color: #b3f0f7; }
        .entity[data-entity="PASSIVE"] { background-color: #c3c0c0; }
        .entity[data-entity=""] { background-color: #cccccc; }
        /* Darker background colors for the entity label tooltips */
        .entity[data-entity="ATTR"]::after { background-color: #d29997; }
        .entity[data-entity="INTRAN_S"]::after { background-color: #ec6161; }
        .entity[data-entity="INTRAN_MOT"]::after { background-color: #eb9422; }
        .entity[data-entity="INTRAN_RES"]::after { background-color: #be5791; }
        .entity[data-entity="CAUS_MOT"]::after { background-color: #007030; }
        .entity[data-entity="TRAN_S"]::after { background-color: #3085ce; }
        .entity[data-entity="TRAN_RES"]::after { background-color: #8268cf; }
        .entity[data-entity="DITRAN"]::after { background-color: #449cbb; }
        .entity[data-entity="PASSIVE"]::after { background-color: #6b6b6b; }
        .entity[data-entity=""]::after { background-color: #888888; }
        /* Styling for the entity label tooltip */
        .entity::after {
            content: attr(data-entity);
            position: absolute;
            bottom: -2em;
            left: 0;
            right: 0;
            color: #fff;
            font-size: 0.6em;
            padding: 2px 4px;
            border-radius: 2px;
            text-align: center;
            min-width: 60px;
            white-space: nowrap;
        }
        /* Dark mode adjustments */
        @media (prefers-color-scheme: dark) {
            body {
                background-color: #181818;
                color: #e0e0e0;
            }
            .container {
                background-color: #282828;
                box-shadow: 0 0 10px rgba(255, 255, 255, 0.1);
            }
            /* Ensure text inside entities is visible in dark mode */
            .entity {
                color: #e0e0e0;
            }
            .entity::after {
                color: #fff;
            }
        }
    </style>
    """
    return style + "<div class='container'>" + result + "</div>"

def process_text(input_text):
    """
    Process input text to detect and tag ASCs.
    Returns a tuple: (HTML string, total tag count, dictionary of tag counts).
    """
    if not input_text.strip():
        return "No text provided. Please enter some text.", 0, {}
    
    doc = nlp(input_text)
    
    if not list(doc.sents):
        return "Please enter at least one sentence.", 0, {}
    
    if not doc.ents:
        return "No ASCs were detected.", 0, {}

    highlighted_html = get_highlighted_text(doc)
    
    # Count each tag type using collections.Counter
    tag_counter = Counter(ent.label_ for ent in doc.ents)
    
    total_count = sum(tag_counter.values())
    
    return highlighted_html, total_count, dict(tag_counter)
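
# Illustrative usage (actual labels and counts depend on the trained model):
#   html, total, counts = process_text("She gave him a book.")
#   e.g. total == 1 and counts == {"DITRAN": 1}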

sample_text = (
    "When, while the lovely valley teems with vapor around me, and the meridian sun strikes the upper surface of the impenetrable foliage of my trees, and but a few stray gleams steal into the inner sanctuary, I throw myself down among the tall grass by the trickling stream; and, as I lie close to the earth, a thousand unknown plants are noticed by me: when I hear the buzz of the little world among the stalks, and grow familiar with the countless indescribable forms of the insects and flies, then I feel the presence of the Almighty, who formed us in his own image, and the breath of that celestial force fills my lungs with an ineffable wonder, drawing my soul into a silent communion with the eternal rhythms of the earth."
)

def fill_sample_text():
    return sample_text

with gr.Blocks() as demo:
    gr.Markdown("# ASC tagger demo")
    gr.Markdown(
        "Enter some text to have ASCs tagged. Use the button below to fill in sample text. "
        "Learn more about the related works [here](https://github.com/LCR-ADS-Lab/ASC-Treebank)."
    )
    input_textbox = gr.Textbox(lines=5, label="Input text", placeholder="Enter text here...")
    output_html = gr.HTML(label="Tagged text")
    output_count = gr.Number(label="Number of ASCs detected", precision=0)
    output_counts_by_type = gr.JSON(label="ASC counts by type")
    
    tag_btn = gr.Button("Tag ASCs")
    fill_btn = gr.Button("Sample text")
    
    # Wire the buttons: "Sample text" fills the input box; "Tag ASCs" runs the tagger.
    fill_btn.click(fn=fill_sample_text, inputs=[], outputs=input_textbox)
    tag_btn.click(
        fn=process_text,
        inputs=input_textbox,
        outputs=[output_html, output_count, output_counts_by_type]
    )

if __name__ == "__main__":
    demo.launch()