IS361Group4 committed on
Commit
1927304
·
verified ·
1 Parent(s): ee76783

สร้าง app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -0
app.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import requests
5
+ import pandas as pd
6
+ import altair as alt
7
+ from collections import OrderedDict
8
+ from nltk.tokenize import sent_tokenize
9
+ import trafilatura
10
+
11
# Load the punkt tokenizer from nltk
import nltk
nltk.download('punkt')

# Load model and tokenizer
model_name = 'dejanseo/sentiment'


@st.cache_resource
def _load_model_and_tokenizer(name):
    """Load the classifier and its tokenizer for the checkpoint *name*.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading the checkpoint on each rerun.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        AutoModelForSequenceClassification.from_pretrained(name),
        AutoTokenizer.from_pretrained(name),
    )


model, tokenizer = _load_model_and_tokenizer(model_name)
19
+
20
# Sentiment labels as textual descriptions, listed from most positive to
# most negative. A label's position in this tuple is its integer key in
# ``sentiment_labels``.
_LABELS_BY_INDEX = (
    "very positive",
    "positive",
    "somewhat positive",
    "neutral",
    "somewhat negative",
    "negative",
    "very negative",
)
sentiment_labels = dict(enumerate(_LABELS_BY_INDEX))

# Background colors for sentiments: green for positive, grey for neutral,
# red for negative, with a higher alpha for stronger sentiment.
background_colors = dict(
    zip(
        _LABELS_BY_INDEX,
        (
            "rgba(0, 255, 0, 0.5)",
            "rgba(0, 255, 0, 0.3)",
            "rgba(0, 255, 0, 0.1)",
            "rgba(128, 128, 128, 0.1)",
            "rgba(255, 0, 0, 0.1)",
            "rgba(255, 0, 0, 0.3)",
            "rgba(255, 0, 0, 0.5)",
        ),
    )
)
41
+
42
# Function to get text content from a URL
def get_text_from_url(url):
    """Download *url* and return the text extracted from the page.

    Returns:
        The extracted text, or an empty string when the page could not be
        fetched or no text could be extracted, so the result is always a
        ``str``.
    """
    downloaded = trafilatura.fetch_url(url)
    if not downloaded:
        return ""
    # extract() may yield None when it finds no usable content; normalise
    # that to "" so both failure paths return the same type.
    return trafilatura.extract(downloaded) or ""
48
+
49
# Function to classify text
def classify_text(text, max_length):
    """Return the per-class softmax scores for *text* as a list of floats.

    Args:
        text: The string to classify.
        max_length: Truncation limit passed to the tokenizer.

    Returns:
        The softmax over the model's logits for the first (only) row of
        the batch, converted to a plain Python list.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**encoded).logits
        probabilities = torch.softmax(logits, dim=-1)
    return probabilities[0].tolist()
56
+
57
# Function to handle long texts
def classify_long_text(text):
    """Classify *text* chunk by chunk and average the per-class scores.

    Args:
        text: The full text to classify.

    Returns:
        A tuple ``(aggregate_scores, chunk_scores_list, chunks)``:
        the per-class mean over all chunks, the per-chunk score lists, and
        the chunk strings themselves. For empty *text* the aggregate is all
        zeros and both lists are empty.
    """
    num_labels = len(sentiment_labels)
    # Empty input yields no chunks; return early instead of dividing by
    # zero when averaging below.
    if not text:
        return [0.0] * num_labels, [], []

    max_length = tokenizer.model_max_length
    # Split the text into chunks.
    # NOTE(review): the slice size is counted in characters while
    # max_length is the tokenizer's limit in tokens — confirm this is the
    # intended chunk size.
    chunks = [
        text[start:start + max_length]
        for start in range(0, len(text), max_length)
    ]
    chunk_scores_list = [classify_text(chunk, max_length) for chunk in chunks]

    # Sum per class across chunks, then average.
    totals = [0.0] * num_labels
    for chunk_scores in chunk_scores_list:
        totals = [total + score for total, score in zip(totals, chunk_scores)]
    aggregate_scores = [total / len(chunks) for total in totals]
    return aggregate_scores, chunk_scores_list, chunks
71
+
72
# Function to classify each sentence in the text
def classify_sentences(text):
    """Label every sentence of *text* with its highest-scoring sentiment.

    Returns:
        A list of ``(sentence, sentiment_label)`` tuples in the order the
        sentences appear in *text*.
    """
    def top_label(sentence):
        # Pick the label whose score is the maximum (first one on ties).
        scores = classify_text(sentence, tokenizer.model_max_length)
        return sentiment_labels[scores.index(max(scores))]

    return [(sentence, top_label(sentence)) for sentence in sent_tokenize(text)]
82
+
83
# Streamlit UI
import html  # imported here because only this section needs it (escaping scraped text)


def _sentiment_bar_chart(scores, sentiment_order):
    """Build an Altair bar chart of *scores* with bars in *sentiment_order*.

    Args:
        scores: Per-class scores, indexed by the keys of ``sentiment_labels``.
        sentiment_order: Label names in the left-to-right order to plot.
    """
    scores_by_label = {
        sentiment_labels[i]: scores[i] for i in range(len(sentiment_labels))
    }
    df = pd.DataFrame({
        "index": sentiment_order,
        "Likelihood": [scores_by_label[label] for label in sentiment_order],
    })
    return alt.Chart(df).mark_bar().encode(
        x=alt.X('index', sort=sentiment_order, title='Sentiment'),
        y='Likelihood',
    ).properties(
        width=600,
        height=400,
    )


st.title("Sentiment Classification Model by DEJAN")

url = st.text_input("Enter URL:")

if url:
    text = get_text_from_url(url)
    if text:
        scores, chunk_scores_list, chunks = classify_long_text(text)

        # Ensure the exact order of labels in the graph
        sentiment_order = [
            "very positive", "positive", "somewhat positive",
            "neutral",
            "somewhat negative", "negative", "very negative"
        ]

        # Overall chart: scores averaged over all chunks
        st.altair_chart(
            _sentiment_bar_chart(scores, sentiment_order),
            use_container_width=True,
        )

        # Display each chunk and its own chart
        for i, (chunk_scores, chunk) in enumerate(zip(chunk_scores_list, chunks)):
            st.write(f"Chunk {i + 1}:")
            st.write(chunk)
            st.altair_chart(
                _sentiment_bar_chart(chunk_scores, sentiment_order),
                use_container_width=True,
            )

        # Sentence-level classification with background colors
        st.write("Extracted Text with Sentiment Highlights:")
        sentence_scores = classify_sentences(text)
        for sentence, sentiment in sentence_scores:
            bg_color = background_colors[sentiment]
            # The sentence comes from an arbitrary web page and is rendered
            # with unsafe_allow_html=True, so escape it to keep any markup in
            # the scraped text from being interpreted as HTML.
            safe_sentence = html.escape(sentence)
            st.markdown(f'<span style="background-color: {bg_color}">{safe_sentence}</span>', unsafe_allow_html=True)

    else:
        st.write("Could not extract text from the provided URL.")
143
+
144
+ # Additional information at the end
145
+ st.markdown("""
146
+ Multi-label sentiment classification model developed by [Dejan Marketing](https://dejanmarketing.com/).
147
+
148
+ The model is designed to be deployed in an automated pipeline capable of classifying text sentiment for thousands (or even millions) of text chunks or as a part of a scraping pipeline. This is a demo model which may occasionally misclassify some texts. In a typical commercial project, a larger model is deployed for the task, and in special cases, a domain-specific model is developed for the client.
149
+
150
+ ### Engage Our Team
151
+ Interested in using this in an automated pipeline for bulk sentiment processing?
152
+
153
+ Please [book an appointment](https://dejanmarketing.com/conference/) to discuss your needs.
154
+ """)