from huggingface_hub import hf_hub_download
import joblib

repo_id = "DevBhojani/Classification-SamsumDataset"
model_filename = "random_forest_classifier_model.joblib"
vectorizer_filename = "tfidf_vectorizer.joblib"

model_path = hf_hub_download(repo_id=repo_id, filename=model_filename)
vectorizer_path = hf_hub_download(repo_id=repo_id, filename=vectorizer_filename)

loaded_classifier_model = joblib.load(model_path)
loaded_tfidf_vectorizer = joblib.load(vectorizer_path)
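
# loaded_classifier_model is the intent classifier and loaded_tfidf_vectorizer
# is the TF-IDF vectorizer it was trained with; both are used later in
# analyze_meeting_transcript(), and the vectorizer must match the classifier
# so that the feature columns line up.
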
import gradio as gr
from transformers import pipeline, AutoTokenizer
import re
import contractions
# loaded_classifier_model and loaded_tfidf_vectorizer are loaded above.

def remove_html_tags(text):
    # Strip HTML tags such as <br> or <p>.
    pattern = r'<[^>]+>'
    text = re.sub(pattern, '', str(text))
    return text

def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', str(text))

def remove_emojis(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002700-\U000027BF"  # miscellaneous symbols
        u"\U0001F900-\U0001F9FF"  # supplemental symbols
        u"\U00002600-\U000026FF"  # weather & other symbols
        u"\U0001FA70-\U0001FAFF"  # extended symbols
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', str(text))

def expand_contractions(text):
    return contractions.fix(text)

def remove_special_and_numbers(text):
    return re.sub(r'[^a-zA-Z\s]', '', str(text))

def clean_text(text):
    text = remove_url(text)
    text = remove_emojis(text)
    text = expand_contractions(text)
    text = text.lower()
    return text
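
# clean_text is the preprocessing applied to every transcript before it is
# summarized and classified: strip URLs and emojis, expand contractions, and
# lower-case. remove_html_tags and remove_special_and_numbers are defined
# above but are not part of this pipeline.
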
summarizer = pipeline("summarization", model="luisotorres/bart-finetuned-samsum")
# summarizer2 = pipeline("summarization", model="knkarthick/MEETING_SUMMARY")
tokenizer = AutoTokenizer.from_pretrained("luisotorres/bart-finetuned-samsum")
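# The tokenizer is loaded from the same checkpoint as the summarizer so that
# truncate_chunk (below) counts tokens exactly as the BART model does.
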
def split_into_chunks(conversation, n=15):
    lines = conversation.strip().split('\n')
    chunk_size = max(1, len(lines) // n)
    return ['\n'.join(lines[i:i + chunk_size]) for i in range(0, len(lines), chunk_size)]
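
# Note: with n=1 (as summarize_dialogue uses below) the whole conversation is
# returned as a single chunk; larger values of n split long transcripts into
# roughly n line-based chunks.
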
def truncate_chunk(text, max_tokens=1024):
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

def summarize_chunks(chunks, model):
    summaries = []
    for chunk in chunks:
        chunk = chunk.strip()
        if not chunk:
            continue
        try:
            truncated_chunk = truncate_chunk(chunk)
            summary = model(truncated_chunk, max_length=1024, min_length=20, do_sample=False)[0]['summary_text']
            summaries.append(summary)
        except Exception as e:
            print(f"Error summarizing chunk: {e}")
    return summaries
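
# Chunks that fail to summarize are logged and skipped, so a single bad chunk
# does not abort the whole transcript.
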
def combine_summaries(summaries):
    return ' '.join(summaries)

def summarize_dialogue(conversation, model):
    chunks = split_into_chunks(conversation, n=1)
    summaries = summarize_chunks(chunks, model)
    final_summary = combine_summaries(summaries)
    return final_summary
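
# summarize_dialogue ties the steps together: split the conversation, summarize
# each chunk, then join the partial summaries into one text.
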
def analyze_meeting_transcript(user_input):
    if not user_input.strip():
        return "Please enter some text to summarize.", ""
    cleaned_input = clean_text(user_input)
    summary1 = summarize_dialogue(cleaned_input, summarizer)
    # Use the loaded vectorizer to transform the input before classification.
    cleaned_input_vectorized = loaded_tfidf_vectorizer.transform([cleaned_input])
    intent_classification = loaded_classifier_model.predict(cleaned_input_vectorized)[0]
    # Format the intent classification label for display.
    formatted_intent = intent_classification.replace("__label__", "").replace("_", " ")
    return summary1, formatted_intent
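
# Quick sanity check without the UI (a sketch; the sample dialogue is made up):
#   summary, intent = analyze_meeting_transcript("Amanda: guess what!\nChris: tell me!")
#   print(summary)
#   print(intent)
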
interface = gr.Interface(
    fn=analyze_meeting_transcript,
    inputs=gr.Textbox(label="Enter dialogue here", lines=12, placeholder="Paste your meeting transcript..."),
    outputs=[
        gr.Textbox(label="Summary (Luis Torres BART)"),
        # gr.Textbox(label="Summary 2 (KN Karthick MEETING_SUMMARY)"),
        gr.Textbox(label="Intent Classification")
    ],
    title="Meeting Transcript Analyzer",
    description="Summarizes meeting dialogues and classifies the intent.",
    allow_flagging="never",
    examples=[
        [
            '''
Amanda: guess what!
Chris: hey ;) ur pregnant!
Amanda: I'm so proud of myself! Remember I go to these dancing classes with Michael?
Chris: Yeah?
Amanda: So we went yesterday and the instructor needed a partner to show the steps we had so far
Chris: so there's only one guy teaching you? without a female partner?
Amanda: Well, this time he was alone, BUT THAT'S NOT THE POINT! Listen!
Chris: yeah, sorry :D tell me!
Amanda: So he needed a partner and noone really knew the steps like perfectly
Amanda: and obviously noone wanted to be mocked
Amanda: so I thought, aaaah :D
Chris: u volunteered? really? you??
Amanda: yeah!
Chris: whooa! that's so great! #therapy #worthit :D
Amanda: yeah i know :D maybe one day i'll actually stop being so shy
Chris: that's definitely the first step! :D congrats!
Amanda: tx ^_^
Chris: what dance was it?
Amanda: English waltz
Chris: isn't it, like, SO difficult?
Amanda: yeah it is! but everyone said I looked like a pro :D
Chris: Well done!!
            '''
        ],
        ["I have some exciting news to share!"],
        [
            '''
Beryl: Hello guys! How are you doing? We've lost contact for a few months now. Hope you are well.
Anton: A happy hello to you Beryl! Great to hear from you. We are fine, thanks. And yourself?
Beryl: I'm very well indeed. Thank you. Any changes in your setup?
Anton: Not really. SOS. Same Old Soup ;) But we are happy for that.
Beryl: Are you still running your lovely airbnb?
Anton: Oh yes, we are. We had a few months off during summer, our summer, but now bookings start flowing in. Well... Are you planning to visit us? You two are always welcome!
Beryl: You caught me here. I'm vaguely considering going down to Onrus again, most likely in January. What does it look like with vacancies then?
Anton: Perfect! Just give me your dates and I'll keep it booked for you.
Beryl: Would you prefer me to do it via airbnb website or just like this directly with you?
Anton: I think it'll be more advantageous for both of us to do it directly. Do you know exactly when you'll be coming?
Beryl: Not so much. Can I get back to you in 2, 3 days' time?
Anton: ASAP really. As I say we've been receiving bookings daily now.
Beryl: Well, no big deal. I'll be staying in Cape Town for a longer time and am quite flexible in my dates.
Anton: Will you be coming with Tino, if I may ask?
Beryl: No. I am single again. Hurray! So pls make it single occupancy any week in January, Anton.
Anton: Great! 4th till 12th?
Beryl: Very good. I'll call you beforehand from Cape Town. Greetings to you both!
Anton: Take care!'''
        ],
    ]
)

if __name__ == "__main__":
    interface.launch(debug=True, share=True)