Spaces:
Sleeping
Sleeping
Update tweet_analyzer.py
Browse files- tweet_analyzer.py +33 -52
tweet_analyzer.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import os
|
2 |
from PyPDF2 import PdfReader
|
3 |
import pandas as pd
|
4 |
from dotenv import load_dotenv
|
@@ -7,6 +7,8 @@ import json
|
|
7 |
from datetime import datetime
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
10 |
import random
|
11 |
|
12 |
class TweetDatasetProcessor:
|
@@ -15,6 +17,7 @@ class TweetDatasetProcessor:
|
|
15 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
16 |
self.tweets = []
|
17 |
self.personality_profile = {}
|
|
|
18 |
|
19 |
def extract_text_from_pdf(self, pdf_path):
|
20 |
"""Extract text content from PDF file."""
|
@@ -60,10 +63,6 @@ class TweetDatasetProcessor:
|
|
60 |
df.to_csv('processed_tweets.csv', index=False)
|
61 |
return df
|
62 |
|
63 |
-
def _extract_timestamp(self, text):
|
64 |
-
"""Extract timestamp if present in tweet."""
|
65 |
-
return None # Implement timestamp extraction logic if needed
|
66 |
-
|
67 |
def _extract_mentions(self, text):
|
68 |
"""Extract mentioned users from tweet."""
|
69 |
return [word for word in text.split() if word.startswith('@')]
|
@@ -72,13 +71,25 @@ class TweetDatasetProcessor:
|
|
72 |
"""Extract hashtags from tweet."""
|
73 |
return [word for word in text.split() if word.startswith('#')]
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
def analyze_personality(self):
|
76 |
-
"""Comprehensive personality analysis."""
|
77 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
78 |
-
|
|
|
79 |
Core beliefs, emotional tendencies, cognitive patterns, etc.
|
80 |
Tweets for analysis:
|
81 |
-
{json.dumps(all_tweets
|
82 |
"""
|
83 |
|
84 |
response = self.groq_client.chat.completions.create(
|
@@ -92,32 +103,26 @@ class TweetDatasetProcessor:
|
|
92 |
self.personality_profile = response.choices[0].message.content
|
93 |
return self.personality_profile
|
94 |
|
95 |
-
def analyze_topics(self, n_topics=
|
96 |
"""Extract and identify different topics the author has tweeted about."""
|
97 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
98 |
-
|
99 |
-
tfidf_matrix = vectorizer.fit_transform(all_tweets)
|
100 |
nmf_model = NMF(n_components=n_topics, random_state=1)
|
101 |
nmf_model.fit(tfidf_matrix)
|
102 |
|
103 |
topics = []
|
104 |
for topic_idx, topic in enumerate(nmf_model.components_):
|
105 |
-
topic_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-n_topics - 1:-1]]
|
106 |
topics.append(" ".join(topic_words))
|
107 |
-
|
108 |
-
# Remove duplicates in topics
|
109 |
-
topics = list(set(topics))
|
110 |
-
return topics
|
111 |
|
112 |
def count_tokens(self, text):
|
113 |
"""Estimate the number of tokens in the given text."""
|
114 |
-
# A basic token count estimation (approximate)
|
115 |
return len(text.split())
|
116 |
|
117 |
def generate_tweet(self, context=""):
|
118 |
"""Generate a new tweet based on personality profile and optional context."""
|
119 |
-
|
120 |
-
historical_topics = self.analyze_topics(n_topics=3) # Reduced number of topics
|
121 |
additional_contexts = historical_topics + [
|
122 |
"Comment on a recent technological advancement.",
|
123 |
"Share a motivational thought.",
|
@@ -125,48 +130,26 @@ class TweetDatasetProcessor:
|
|
125 |
"Reflect on a past experience.",
|
126 |
"Provide advice to followers."
|
127 |
]
|
128 |
-
|
129 |
-
# Randomly select multiple contexts to increase diversity
|
130 |
selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
|
131 |
|
132 |
-
#
|
133 |
-
|
134 |
-
|
|
|
|
|
135 |
|
136 |
-
|
137 |
-
personality_profile_excerpt = self.personality_profile[:400] # Further truncation
|
138 |
|
139 |
-
# Combine everything and check token count
|
140 |
prompt = f"""Based on this personality profile:
|
141 |
{personality_profile_excerpt}
|
142 |
Current context or topic (if any):
|
143 |
{context}
|
144 |
Additionally, consider these contexts to increase diversity:
|
145 |
{', '.join(selected_contexts)}
|
146 |
-
|
147 |
Tweets for context:
|
148 |
{', '.join(all_tweets)}
|
149 |
-
|
150 |
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
151 |
"""
|
152 |
-
|
153 |
-
token_count = self.count_tokens(prompt)
|
154 |
-
if token_count > 6000: # Limit to 6000 tokens (adjust as needed)
|
155 |
-
# Further truncate the tweet and topics if token limit is exceeded
|
156 |
-
all_tweets = all_tweets[:3] # Reduce the number of tweets used
|
157 |
-
prompt = f"""Based on this personality profile:
|
158 |
-
{personality_profile_excerpt}
|
159 |
-
Current context or topic (if any):
|
160 |
-
{context}
|
161 |
-
Additionally, consider these contexts to increase diversity:
|
162 |
-
{', '.join(selected_contexts)}
|
163 |
-
|
164 |
-
Tweets for context:
|
165 |
-
{', '.join(all_tweets)}
|
166 |
-
|
167 |
-
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
168 |
-
"""
|
169 |
-
|
170 |
try:
|
171 |
response = self.groq_client.chat.completions.create(
|
172 |
messages=[
|
@@ -174,13 +157,11 @@ class TweetDatasetProcessor:
|
|
174 |
{"role": "user", "content": prompt},
|
175 |
],
|
176 |
model="llama-3.1-70b-versatile",
|
177 |
-
temperature=1.0,
|
178 |
max_tokens=150,
|
179 |
)
|
180 |
-
tweet = response.choices[0].message.content
|
181 |
-
|
182 |
-
return tweet.strip().split("\n")[0] # Only return the first line (tweet)
|
183 |
except Exception as e:
|
184 |
print(f"Error generating tweet: {e}")
|
185 |
return "Error generating tweet"
|
186 |
-
|
|
|
1 |
+
import os
|
2 |
from PyPDF2 import PdfReader
|
3 |
import pandas as pd
|
4 |
from dotenv import load_dotenv
|
|
|
7 |
from datetime import datetime
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
+
from sklearn.cluster import KMeans
|
12 |
import random
|
13 |
|
14 |
class TweetDatasetProcessor:
|
|
|
17 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
18 |
self.tweets = []
|
19 |
self.personality_profile = {}
|
20 |
+
self.vectorizer = TfidfVectorizer(stop_words='english')
|
21 |
|
22 |
def extract_text_from_pdf(self, pdf_path):
|
23 |
"""Extract text content from PDF file."""
|
|
|
63 |
df.to_csv('processed_tweets.csv', index=False)
|
64 |
return df
|
65 |
|
|
|
|
|
|
|
|
|
66 |
def _extract_mentions(self, text):
    """Extract mentioned users from tweet.

    A token counts as a mention when it begins with the '@' sigil.
    Returns the matching tokens in their original order.
    """
    mentions = []
    for token in text.split():
        if token.startswith('@'):
            mentions.append(token)
    return mentions
|
|
|
71 |
"""Extract hashtags from tweet."""
|
72 |
return [word for word in text.split() if word.startswith('#')]
|
73 |
|
74 |
+
def categorize_tweets(self, n_clusters=5):
    """Cluster tweets into labeled categories using KMeans.

    Fits the shared TF-IDF vectorizer on all tweet contents, clusters the
    resulting matrix, and annotates each tweet dict in ``self.tweets``
    in-place with a ``'category'`` label of the form ``"Category <k>"``.

    Args:
        n_clusters: Desired number of clusters (default 5, matching the
            previous hard-coded value). Clamped to the number of tweets,
            because KMeans raises when asked for more clusters than samples.

    Returns:
        pandas.DataFrame built from the annotated tweet dicts; an empty
        DataFrame when no tweets have been loaded.
    """
    if not self.tweets:
        # Nothing to cluster; avoid a ValueError from fit_transform([]).
        return pd.DataFrame()
    all_tweets = [tweet['content'] for tweet in self.tweets]
    tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
    # random_state pinned for reproducible cluster assignments.
    kmeans = KMeans(n_clusters=min(n_clusters, len(all_tweets)), random_state=1)
    kmeans.fit(tfidf_matrix)

    for i, tweet in enumerate(self.tweets):
        tweet['category'] = f"Category {kmeans.labels_[i]}"
    return pd.DataFrame(self.tweets)
|
84 |
+
|
85 |
def analyze_personality(self):
|
86 |
+
"""Comprehensive personality analysis using all tweets."""
|
87 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
88 |
+
# Use a broader dataset for a comprehensive profile
|
89 |
+
analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
|
90 |
Core beliefs, emotional tendencies, cognitive patterns, etc.
|
91 |
Tweets for analysis:
|
92 |
+
{json.dumps(all_tweets, indent=2)}
|
93 |
"""
|
94 |
|
95 |
response = self.groq_client.chat.completions.create(
|
|
|
103 |
self.personality_profile = response.choices[0].message.content
|
104 |
return self.personality_profile
|
105 |
|
106 |
+
def analyze_topics(self, n_topics=5):
    """Extract and identify different topics the author has tweeted about.

    Fits the shared TF-IDF vectorizer on all tweet contents, then runs NMF
    to decompose the matrix into ``n_topics`` components.

    Args:
        n_topics: Number of NMF components (topics) to extract; also used
            as the number of top terms kept per topic, as before.

    Returns:
        De-duplicated list of topic strings, each being a topic's top
        terms joined by spaces. Order is unspecified because duplicates
        are removed via a set.
    """
    all_tweets = [tweet['content'] for tweet in self.tweets]
    tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
    nmf_model = NMF(n_components=n_topics, random_state=1)
    nmf_model.fit(tfidf_matrix)

    # Hoisted: the original rebuilt get_feature_names_out() for every
    # word of every topic, which is O(topics * words) vocabulary builds.
    feature_names = self.vectorizer.get_feature_names_out()
    topics = []
    for topic in nmf_model.components_:
        # argsort()[:-n_topics - 1:-1] takes the n_topics highest-weight
        # term indices in descending order.
        top_indices = topic.argsort()[:-n_topics - 1:-1]
        topics.append(" ".join(feature_names[i] for i in top_indices))
    return list(set(topics))  # Remove duplicates
|
|
|
|
|
|
|
118 |
|
119 |
def count_tokens(self, text):
    """Roughly estimate how many tokens *text* contains.

    Whitespace-separated word count is used as a cheap proxy for model
    tokens; no tokenizer is involved.
    """
    # split() with no arguments collapses runs of whitespace, so empty
    # or all-whitespace input yields 0.
    words = text.split()
    return len(words)
|
122 |
|
123 |
def generate_tweet(self, context=""):
|
124 |
"""Generate a new tweet based on personality profile and optional context."""
|
125 |
+
historical_topics = self.analyze_topics(n_topics=5)
|
|
|
126 |
additional_contexts = historical_topics + [
|
127 |
"Comment on a recent technological advancement.",
|
128 |
"Share a motivational thought.",
|
|
|
130 |
"Reflect on a past experience.",
|
131 |
"Provide advice to followers."
|
132 |
]
|
|
|
|
|
133 |
selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
|
134 |
|
135 |
+
# Select tweets close to the author's style
|
136 |
+
tfidf_matrix = self.vectorizer.transform([tweet['content'] for tweet in self.tweets])
|
137 |
+
similarity = cosine_similarity(tfidf_matrix)
|
138 |
+
tweet_sample_indices = similarity.sum(axis=1).argsort()[-5:] # Top 5 similar tweets
|
139 |
+
all_tweets = [self.tweets[i]['content'] for i in tweet_sample_indices]
|
140 |
|
141 |
+
personality_profile_excerpt = self.personality_profile[:400]
|
|
|
142 |
|
|
|
143 |
prompt = f"""Based on this personality profile:
|
144 |
{personality_profile_excerpt}
|
145 |
Current context or topic (if any):
|
146 |
{context}
|
147 |
Additionally, consider these contexts to increase diversity:
|
148 |
{', '.join(selected_contexts)}
|
|
|
149 |
Tweets for context:
|
150 |
{', '.join(all_tweets)}
|
|
|
151 |
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
152 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
try:
|
154 |
response = self.groq_client.chat.completions.create(
|
155 |
messages=[
|
|
|
157 |
{"role": "user", "content": prompt},
|
158 |
],
|
159 |
model="llama-3.1-70b-versatile",
|
160 |
+
temperature=1.0,
|
161 |
max_tokens=150,
|
162 |
)
|
163 |
+
tweet = response.choices[0].message.content.strip()
|
164 |
+
return tweet
|
|
|
165 |
except Exception as e:
|
166 |
print(f"Error generating tweet: {e}")
|
167 |
return "Error generating tweet"
|
|