Manasa1 committed on
Commit
3f9241e
·
verified ·
1 Parent(s): 60b3b65

Update tweet_analyzer.py

Browse files
Files changed (1) hide show
  1. tweet_analyzer.py +33 -52
tweet_analyzer.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  from PyPDF2 import PdfReader
3
  import pandas as pd
4
  from dotenv import load_dotenv
@@ -7,6 +7,8 @@ import json
7
  from datetime import datetime
8
  from sklearn.decomposition import NMF
9
  from sklearn.feature_extraction.text import TfidfVectorizer
 
 
10
  import random
11
 
12
  class TweetDatasetProcessor:
@@ -15,6 +17,7 @@ class TweetDatasetProcessor:
15
  self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
16
  self.tweets = []
17
  self.personality_profile = {}
 
18
 
19
  def extract_text_from_pdf(self, pdf_path):
20
  """Extract text content from PDF file."""
@@ -60,10 +63,6 @@ class TweetDatasetProcessor:
60
  df.to_csv('processed_tweets.csv', index=False)
61
  return df
62
 
63
- def _extract_timestamp(self, text):
64
- """Extract timestamp if present in tweet."""
65
- return None # Implement timestamp extraction logic if needed
66
-
67
  def _extract_mentions(self, text):
68
  """Extract mentioned users from tweet."""
69
  return [word for word in text.split() if word.startswith('@')]
@@ -72,13 +71,25 @@ class TweetDatasetProcessor:
72
  """Extract hashtags from tweet."""
73
  return [word for word in text.split() if word.startswith('#')]
74
 
 
 
 
 
 
 
 
 
 
 
 
75
  def analyze_personality(self):
76
- """Comprehensive personality analysis."""
77
  all_tweets = [tweet['content'] for tweet in self.tweets]
78
- analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets. Analyze:
 
79
  Core beliefs, emotional tendencies, cognitive patterns, etc.
80
  Tweets for analysis:
81
- {json.dumps(all_tweets[:5], indent=2)} # Further reduced number of tweets
82
  """
83
 
84
  response = self.groq_client.chat.completions.create(
@@ -92,32 +103,26 @@ class TweetDatasetProcessor:
92
  self.personality_profile = response.choices[0].message.content
93
  return self.personality_profile
94
 
95
- def analyze_topics(self, n_topics=3): # Reduced the number of topics
96
  """Extract and identify different topics the author has tweeted about."""
97
  all_tweets = [tweet['content'] for tweet in self.tweets]
98
- vectorizer = TfidfVectorizer(stop_words='english')
99
- tfidf_matrix = vectorizer.fit_transform(all_tweets)
100
  nmf_model = NMF(n_components=n_topics, random_state=1)
101
  nmf_model.fit(tfidf_matrix)
102
 
103
  topics = []
104
  for topic_idx, topic in enumerate(nmf_model.components_):
105
- topic_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-n_topics - 1:-1]]
106
  topics.append(" ".join(topic_words))
107
-
108
- # Remove duplicates in topics
109
- topics = list(set(topics))
110
- return topics
111
 
112
  def count_tokens(self, text):
113
  """Estimate the number of tokens in the given text."""
114
- # A basic token count estimation (approximate)
115
  return len(text.split())
116
 
117
  def generate_tweet(self, context=""):
118
  """Generate a new tweet based on personality profile and optional context."""
119
- # Extract historical topics and add them to additional contexts
120
- historical_topics = self.analyze_topics(n_topics=3) # Reduced number of topics
121
  additional_contexts = historical_topics + [
122
  "Comment on a recent technological advancement.",
123
  "Share a motivational thought.",
@@ -125,48 +130,26 @@ class TweetDatasetProcessor:
125
  "Reflect on a past experience.",
126
  "Provide advice to followers."
127
  ]
128
-
129
- # Randomly select multiple contexts to increase diversity
130
  selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
131
 
132
- # Randomly sample tweets across different time periods to avoid repetition of topics
133
- tweet_sample = random.sample(self.tweets, min(5, len(self.tweets))) # Further reduced number of tweets
134
- all_tweets = [tweet['content'] for tweet in tweet_sample]
 
 
135
 
136
- # If personality profile is too long, truncate it (adjust length as needed)
137
- personality_profile_excerpt = self.personality_profile[:400] # Further truncation
138
 
139
- # Combine everything and check token count
140
  prompt = f"""Based on this personality profile:
141
  {personality_profile_excerpt}
142
  Current context or topic (if any):
143
  {context}
144
  Additionally, consider these contexts to increase diversity:
145
  {', '.join(selected_contexts)}
146
-
147
  Tweets for context:
148
  {', '.join(all_tweets)}
149
-
150
  **Only generate the tweet. Do not include analysis, explanation, or any other content.**
151
  """
152
-
153
- token_count = self.count_tokens(prompt)
154
- if token_count > 6000: # Limit to 6000 tokens (adjust as needed)
155
- # Further truncate the tweet and topics if token limit is exceeded
156
- all_tweets = all_tweets[:3] # Reduce the number of tweets used
157
- prompt = f"""Based on this personality profile:
158
- {personality_profile_excerpt}
159
- Current context or topic (if any):
160
- {context}
161
- Additionally, consider these contexts to increase diversity:
162
- {', '.join(selected_contexts)}
163
-
164
- Tweets for context:
165
- {', '.join(all_tweets)}
166
-
167
- **Only generate the tweet. Do not include analysis, explanation, or any other content.**
168
- """
169
-
170
  try:
171
  response = self.groq_client.chat.completions.create(
172
  messages=[
@@ -174,13 +157,11 @@ class TweetDatasetProcessor:
174
  {"role": "user", "content": prompt},
175
  ],
176
  model="llama-3.1-70b-versatile",
177
- temperature=1.0, # Increased temperature for more diversity
178
  max_tokens=150,
179
  )
180
- tweet = response.choices[0].message.content
181
- # Ensure the response only contains the tweet text, and nothing else.
182
- return tweet.strip().split("\n")[0] # Only return the first line (tweet)
183
  except Exception as e:
184
  print(f"Error generating tweet: {e}")
185
  return "Error generating tweet"
186
-
 
1
+ import os
2
  from PyPDF2 import PdfReader
3
  import pandas as pd
4
  from dotenv import load_dotenv
 
7
  from datetime import datetime
8
  from sklearn.decomposition import NMF
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+ from sklearn.cluster import KMeans
12
  import random
13
 
14
  class TweetDatasetProcessor:
 
17
  self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
18
  self.tweets = []
19
  self.personality_profile = {}
20
+ self.vectorizer = TfidfVectorizer(stop_words='english')
21
 
22
  def extract_text_from_pdf(self, pdf_path):
23
  """Extract text content from PDF file."""
 
63
  df.to_csv('processed_tweets.csv', index=False)
64
  return df
65
 
 
 
 
 
66
  def _extract_mentions(self, text):
67
  """Extract mentioned users from tweet."""
68
  return [word for word in text.split() if word.startswith('@')]
 
71
  """Extract hashtags from tweet."""
72
  return [word for word in text.split() if word.startswith('#')]
73
 
74
+ def categorize_tweets(self):
75
+ """Cluster tweets into categories using KMeans."""
76
+ all_tweets = [tweet['content'] for tweet in self.tweets]
77
+ tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
78
+ kmeans = KMeans(n_clusters=5, random_state=1)
79
+ kmeans.fit(tfidf_matrix)
80
+
81
+ for i, tweet in enumerate(self.tweets):
82
+ tweet['category'] = f"Category {kmeans.labels_[i]}"
83
+ return pd.DataFrame(self.tweets)
84
+
85
  def analyze_personality(self):
86
+ """Comprehensive personality analysis using all tweets."""
87
  all_tweets = [tweet['content'] for tweet in self.tweets]
88
+ # Use a broader dataset for a comprehensive profile
89
+ analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
90
  Core beliefs, emotional tendencies, cognitive patterns, etc.
91
  Tweets for analysis:
92
+ {json.dumps(all_tweets, indent=2)}
93
  """
94
 
95
  response = self.groq_client.chat.completions.create(
 
103
  self.personality_profile = response.choices[0].message.content
104
  return self.personality_profile
105
 
106
+ def analyze_topics(self, n_topics=5):
107
  """Extract and identify different topics the author has tweeted about."""
108
  all_tweets = [tweet['content'] for tweet in self.tweets]
109
+ tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
 
110
  nmf_model = NMF(n_components=n_topics, random_state=1)
111
  nmf_model.fit(tfidf_matrix)
112
 
113
  topics = []
114
  for topic_idx, topic in enumerate(nmf_model.components_):
115
+ topic_words = [self.vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-n_topics - 1:-1]]
116
  topics.append(" ".join(topic_words))
117
+ return list(set(topics)) # Remove duplicates
 
 
 
118
 
119
  def count_tokens(self, text):
120
  """Estimate the number of tokens in the given text."""
 
121
  return len(text.split())
122
 
123
  def generate_tweet(self, context=""):
124
  """Generate a new tweet based on personality profile and optional context."""
125
+ historical_topics = self.analyze_topics(n_topics=5)
 
126
  additional_contexts = historical_topics + [
127
  "Comment on a recent technological advancement.",
128
  "Share a motivational thought.",
 
130
  "Reflect on a past experience.",
131
  "Provide advice to followers."
132
  ]
 
 
133
  selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
134
 
135
+ # Select tweets close to the author's style
136
+ tfidf_matrix = self.vectorizer.transform([tweet['content'] for tweet in self.tweets])
137
+ similarity = cosine_similarity(tfidf_matrix)
138
+ tweet_sample_indices = similarity.sum(axis=1).argsort()[-5:] # Top 5 similar tweets
139
+ all_tweets = [self.tweets[i]['content'] for i in tweet_sample_indices]
140
 
141
+ personality_profile_excerpt = self.personality_profile[:400]
 
142
 
 
143
  prompt = f"""Based on this personality profile:
144
  {personality_profile_excerpt}
145
  Current context or topic (if any):
146
  {context}
147
  Additionally, consider these contexts to increase diversity:
148
  {', '.join(selected_contexts)}
 
149
  Tweets for context:
150
  {', '.join(all_tweets)}
 
151
  **Only generate the tweet. Do not include analysis, explanation, or any other content.**
152
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
  try:
154
  response = self.groq_client.chat.completions.create(
155
  messages=[
 
157
  {"role": "user", "content": prompt},
158
  ],
159
  model="llama-3.1-70b-versatile",
160
+ temperature=1.0,
161
  max_tokens=150,
162
  )
163
+ tweet = response.choices[0].message.content.strip()
164
+ return tweet
 
165
  except Exception as e:
166
  print(f"Error generating tweet: {e}")
167
  return "Error generating tweet"