Spaces:
Running
Running
File size: 4,472 Bytes
4bba8df 89d4ec8 4bba8df 471013f 4bba8df af93ecc 4bba8df c259974 a27d7dd 89d4ec8 c259974 89d4ec8 c259974 89d4ec8 4bba8df 1849f87 4bba8df 2d98e08 fb44459 829404f fb44459 5bdfb1c fb44459 2490733 fb44459 af93ecc 5bdfb1c 98dfbf2 9ec907c 98dfbf2 9ec907c 98dfbf2 fb44459 fb0c046 3772be3 6796ab1 fb44459 5bdfb1c fb44459 5bdfb1c 4bba8df fb44459 4bba8df fb44459 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import gradio as gr
import os
import torch
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from huggingface_hub import HfApi
from label_dicts import CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES, CAP_LABEL_NAMES
from .utils import is_disk_full
from itertools import islice
def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list.

    Fewer than ``n`` items are returned when the iterable runs out early.
    """
    leading_items = islice(iterable, n)
    return [*leading_items]
def score_to_color(prob):
    """Map a score to a CSS ``rgb(...)`` string on a red-to-green scale.

    A score of 0 gives pure red and a score of 1 gives pure green; the blue
    channel is always 0. Channel values are truncated to integers.
    """
    channels = (int(255 * (1 - prob)), int(255 * prob), 0)
    return "rgb({},{},{})".format(*channels)
# Hugging Face access token, read from the "hf_read" environment variable.
# Indexing os.environ (rather than using .get) raises KeyError at import time
# when the variable is not set.
HF_TOKEN = os.environ["hf_read"]
# Choices offered by the "Language" dropdown of the demo interface.
languages = [
"Multilingual",
]
# Maps the human-readable domain names shown in the "Domain" dropdown (keys)
# to their short domain codes (values). Insertion order is the dropdown order.
domains = {
"media": "media",
"social media": "social",
"parliamentary speech": "parlspeech",
"legislative documents": "legislative",
"executive speech": "execspeech",
"executive order": "execorder",
"party programs": "party",
"judiciary": "judiciary",
"budget": "budget",
"public opinion": "publicopinion",
"local government agenda": "localgovernment"
}
def convert_minor_to_major(minor_topic):
    """Return the major-topic label name for a minor-topic code.

    The code 999 is looked up as-is. For any other code the last two
    characters of its string form are dropped and the remainder is used as
    the integer key into ``CAP_LABEL_NAMES``.
    """
    major_code = 999 if minor_topic == 999 else str(minor_topic)[:-2]
    return CAP_LABEL_NAMES[int(major_code)]
def check_huggingface_path(checkpoint_path: str) -> bool:
    """Report whether ``checkpoint_path`` can be looked up on the Hugging Face Hub.

    Args:
        checkpoint_path: Repository id passed to ``HfApi.model_info``.

    Returns:
        True when the ``model_info`` lookup succeeds, False when it raises.
    """
    try:
        hf_api = HfApi(token=HF_TOKEN)
        hf_api.model_info(checkpoint_path, token=HF_TOKEN)
    except Exception:
        # Best-effort probe: any lookup failure means "not available".
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        return False
    return True
def build_huggingface_path(language: str, domain: str):
    """Return the Hugging Face repository id of the model to use.

    Both arguments are accepted but not consulted: the same pooled
    checkpoint is returned for every language/domain combination.
    """
    pooled_checkpoint = "poltextlab/xlm-roberta-large-pooled-cap-minor-v3"
    return pooled_checkpoint
def _format_prediction_label(minor_code):
    """Build the ``[major]Major name [minor]Minor name`` display label for a minor code."""
    code_text = str(minor_code)
    # "999" is kept whole; any other code loses its last two characters.
    major_code = "999" if code_text == "999" else code_text[:-2]
    return (
        f"[{major_code}]{convert_minor_to_major(minor_code)} "
        f"[{minor_code}]{CAP_MIN_LABEL_NAMES[minor_code]}"
    )


def _render_predictions_html(ranked_items):
    """Render ``(label, probability)`` pairs, best first, as the HTML result panel."""
    text_color = "black"
    parts = ['<div style="background-color: white">']
    for position, (label, prob) in enumerate(ranked_items):
        # Truncated percentage, used for both the bar width and the caption.
        percent = int(prob * 100)
        if position == 0:
            # The top prediction additionally gets a large centered headline.
            parts.append(
                "\n"
                '<div style="text-align: center; font-weight: bold; font-size: 30px; margin-bottom: 10px;">\n'
                f'<span style="color: {text_color};">{label}</span>\n'
                "</div>"
            )
        parts.append(
            "\n"
            f'<div style="height: 4px; background-color: green; width: {percent}%; margin-bottom: 8px;"></div>\n'
            '<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: 4px;">\n'
            f'<span style="color: {text_color};">{label} — {percent}%</span>\n'
            "</div>\n"
        )
    parts.append("</div>")
    return "".join(parts)


def predict(text, model_id, tokenizer_id):
    """Classify ``text`` and render the five most probable topics as HTML.

    The model and tokenizer are loaded with ``from_pretrained`` on every
    call, the text is truncated to 256 tokens, and the logits are turned
    into probabilities with a softmax.

    Args:
        text: Input text to classify.
        model_id: Identifier passed to
            ``AutoModelForSequenceClassification.from_pretrained``; it is also
            used to build the model link in the returned info paragraph.
        tokenizer_id: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(html, output_info)`` tuple: the HTML panel with the top five
        predictions and an HTML paragraph linking to the model used.
    """
    device = torch.device("cpu")
    model = AutoModelForSequenceClassification.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        device_map="auto",
        offload_folder="offload",
        token=HF_TOKEN,
    )
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
    inputs = tokenizer(
        text,
        max_length=256,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(device)
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    # argsort is ascending, so the reversed order walks class indices from the
    # most to the least probable; dict insertion order keeps that ranking, so
    # no second sort is needed.
    output_pred = {
        _format_prediction_label(CAP_MIN_NUM_DICT[i]): probs[i]
        for i in np.argsort(probs)[::-1]
    }
    html = _render_predictions_html(take(5, output_pred.items()))
    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'
    return html, output_info
def predict_cap(text, language, domain):
    """Gradio entry point: classify ``text`` for the chosen language and domain.

    ``domain`` is the human-readable dropdown value and is translated to its
    short code through ``domains`` (an unknown name raises ``KeyError``).
    When ``is_disk_full()`` reports true, the model directories are removed
    with shell commands before predicting.

    Returns:
        Whatever ``predict`` returns for the resolved model and tokenizer.
    """
    domain_code = domains[domain]
    model_id = build_huggingface_path(language, domain_code)
    if is_disk_full():
        cleanup_commands = (
            'rm -rf /data/models*',
            'rm -r ~/.cache/huggingface/hub',
        )
        for command in cleanup_commands:
            os.system(command)
    return predict(text, model_id, "xlm-roberta-large")
# Gradio interface wiring the text box and the two dropdowns to predict_cap;
# the two outputs receive the HTML panel and the model-info paragraph.
demo = gr.Interface(
    fn=predict_cap,
    inputs=[
        gr.Textbox(lines=6, label="Input"),
        gr.Dropdown(languages, label="Language"),
        gr.Dropdown(domains.keys(), label="Domain"),
    ],
    outputs=[
        gr.HTML(label="Output"),
        gr.Markdown(),
    ],
    title="CAP Minor Topics Babel Demo",
)
|