Spaces:
Sleeping
Sleeping
Update tweet_analyzer.py
Browse files- tweet_analyzer.py +33 -52
tweet_analyzer.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import os
|
2 |
from PyPDF2 import PdfReader
|
3 |
import pandas as pd
|
4 |
from dotenv import load_dotenv
|
@@ -7,6 +7,8 @@ import json
|
|
7 |
from datetime import datetime
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
|
10 |
import random
|
11 |
|
12 |
class TweetDatasetProcessor:
|
@@ -15,6 +17,7 @@ class TweetDatasetProcessor:
|
|
15 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
16 |
self.tweets = []
|
17 |
self.personality_profile = {}
|
|
|
18 |
|
19 |
def extract_text_from_pdf(self, pdf_path):
|
20 |
"""Extract text content from PDF file."""
|
@@ -60,10 +63,6 @@ class TweetDatasetProcessor:
|
|
60 |
df.to_csv('processed_tweets.csv', index=False)
|
61 |
return df
|
62 |
|
63 |
-
def _extract_timestamp(self, text):
|
64 |
-
"""Extract timestamp if present in tweet."""
|
65 |
-
return None # Implement timestamp extraction logic if needed
|
66 |
-
|
67 |
def _extract_mentions(self, text):
|
68 |
"""Extract mentioned users from tweet."""
|
69 |
return [word for word in text.split() if word.startswith('@')]
|
@@ -72,13 +71,25 @@ class TweetDatasetProcessor:
|
|
72 |
"""Extract hashtags from tweet."""
|
73 |
return [word for word in text.split() if word.startswith('#')]
|
74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
def analyze_personality(self):
|
76 |
-
"""Comprehensive personality analysis."""
|
77 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
78 |
-
|
|
|
79 |
Core beliefs, emotional tendencies, cognitive patterns, etc.
|
80 |
Tweets for analysis:
|
81 |
-
{json.dumps(all_tweets
|
82 |
"""
|
83 |
|
84 |
response = self.groq_client.chat.completions.create(
|
@@ -92,32 +103,26 @@ class TweetDatasetProcessor:
|
|
92 |
self.personality_profile = response.choices[0].message.content
|
93 |
return self.personality_profile
|
94 |
|
95 |
-
def analyze_topics(self, n_topics=
|
96 |
"""Extract and identify different topics the author has tweeted about."""
|
97 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
98 |
-
|
99 |
-
tfidf_matrix = vectorizer.fit_transform(all_tweets)
|
100 |
nmf_model = NMF(n_components=n_topics, random_state=1)
|
101 |
nmf_model.fit(tfidf_matrix)
|
102 |
|
103 |
topics = []
|
104 |
for topic_idx, topic in enumerate(nmf_model.components_):
|
105 |
-
topic_words = [vectorizer.get_feature_names_out()[i] for i in topic.argsort()[:-n_topics - 1:-1]]
|
106 |
topics.append(" ".join(topic_words))
|
107 |
-
|
108 |
-
# Remove duplicates in topics
|
109 |
-
topics = list(set(topics))
|
110 |
-
return topics
|
111 |
|
112 |
def count_tokens(self, text):
|
113 |
"""Estimate the number of tokens in the given text."""
|
114 |
-
# A basic token count estimation (approximate)
|
115 |
return len(text.split())
|
116 |
|
117 |
def generate_tweet(self, context=""):
|
118 |
"""Generate a new tweet based on personality profile and optional context."""
|
119 |
-
|
120 |
-
historical_topics = self.analyze_topics(n_topics=3) # Reduced number of topics
|
121 |
additional_contexts = historical_topics + [
|
122 |
"Comment on a recent technological advancement.",
|
123 |
"Share a motivational thought.",
|
@@ -125,48 +130,26 @@ class TweetDatasetProcessor:
|
|
125 |
"Reflect on a past experience.",
|
126 |
"Provide advice to followers."
|
127 |
]
|
128 |
-
|
129 |
-
# Randomly select multiple contexts to increase diversity
|
130 |
selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
|
131 |
|
132 |
-
#
|
133 |
-
|
134 |
-
|
|
|
|
|
135 |
|
136 |
-
|
137 |
-
personality_profile_excerpt = self.personality_profile[:400] # Further truncation
|
138 |
|
139 |
-
# Combine everything and check token count
|
140 |
prompt = f"""Based on this personality profile:
|
141 |
{personality_profile_excerpt}
|
142 |
Current context or topic (if any):
|
143 |
{context}
|
144 |
Additionally, consider these contexts to increase diversity:
|
145 |
{', '.join(selected_contexts)}
|
146 |
-
|
147 |
Tweets for context:
|
148 |
{', '.join(all_tweets)}
|
149 |
-
|
150 |
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
151 |
"""
|
152 |
-
|
153 |
-
token_count = self.count_tokens(prompt)
|
154 |
-
if token_count > 6000: # Limit to 6000 tokens (adjust as needed)
|
155 |
-
# Further truncate the tweet and topics if token limit is exceeded
|
156 |
-
all_tweets = all_tweets[:3] # Reduce the number of tweets used
|
157 |
-
prompt = f"""Based on this personality profile:
|
158 |
-
{personality_profile_excerpt}
|
159 |
-
Current context or topic (if any):
|
160 |
-
{context}
|
161 |
-
Additionally, consider these contexts to increase diversity:
|
162 |
-
{', '.join(selected_contexts)}
|
163 |
-
|
164 |
-
Tweets for context:
|
165 |
-
{', '.join(all_tweets)}
|
166 |
-
|
167 |
-
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
168 |
-
"""
|
169 |
-
|
170 |
try:
|
171 |
response = self.groq_client.chat.completions.create(
|
172 |
messages=[
|
@@ -174,13 +157,11 @@ class TweetDatasetProcessor:
|
|
174 |
{"role": "user", "content": prompt},
|
175 |
],
|
176 |
model="llama-3.1-70b-versatile",
|
177 |
-
temperature=1.0,
|
178 |
max_tokens=150,
|
179 |
)
|
180 |
-
tweet = response.choices[0].message.content
|
181 |
-
|
182 |
-
return tweet.strip().split("\n")[0] # Only return the first line (tweet)
|
183 |
except Exception as e:
|
184 |
print(f"Error generating tweet: {e}")
|
185 |
return "Error generating tweet"
|
186 |
-
|
|
|
1 |
+
import os
|
2 |
from PyPDF2 import PdfReader
|
3 |
import pandas as pd
|
4 |
from dotenv import load_dotenv
|
|
|
7 |
from datetime import datetime
|
8 |
from sklearn.decomposition import NMF
|
9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
10 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
11 |
+
from sklearn.cluster import KMeans
|
12 |
import random
|
13 |
|
14 |
class TweetDatasetProcessor:
|
|
|
17 |
self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
|
18 |
self.tweets = []
|
19 |
self.personality_profile = {}
|
20 |
+
self.vectorizer = TfidfVectorizer(stop_words='english')
|
21 |
|
22 |
def extract_text_from_pdf(self, pdf_path):
|
23 |
"""Extract text content from PDF file."""
|
|
|
63 |
df.to_csv('processed_tweets.csv', index=False)
|
64 |
return df
|
65 |
|
|
|
|
|
|
|
|
|
66 |
def _extract_mentions(self, text):
    """Extract mentioned users from tweet.

    A token counts as a mention when it begins with the '@' sigil.
    Returns the matching tokens in their original order.
    """
    mentions = []
    for token in text.split():
        if token.startswith('@'):
            mentions.append(token)
    return mentions
|
|
|
71 |
"""Extract hashtags from tweet."""
|
72 |
return [word for word in text.split() if word.startswith('#')]
|
73 |
|
74 |
+
def categorize_tweets(self, n_clusters=5):
    """Cluster tweets into labeled categories using KMeans.

    Fits the shared TF-IDF vectorizer on all tweet contents, clusters the
    resulting matrix, and annotates each tweet dict in ``self.tweets``
    in-place with a ``'category'`` label of the form ``"Category <k>"``.

    Args:
        n_clusters: Desired number of clusters (default 5, matching the
            previous hard-coded value). Clamped to the number of tweets,
            because KMeans raises when asked for more clusters than samples.

    Returns:
        pandas.DataFrame built from the annotated tweet dicts; an empty
        DataFrame when no tweets have been loaded.
    """
    if not self.tweets:
        # Nothing to cluster; avoid a ValueError from fit_transform([]).
        return pd.DataFrame()
    all_tweets = [tweet['content'] for tweet in self.tweets]
    tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
    # random_state pinned for reproducible cluster assignments.
    kmeans = KMeans(n_clusters=min(n_clusters, len(all_tweets)), random_state=1)
    kmeans.fit(tfidf_matrix)

    for i, tweet in enumerate(self.tweets):
        tweet['category'] = f"Category {kmeans.labels_[i]}"
    return pd.DataFrame(self.tweets)
|
84 |
+
|
85 |
def analyze_personality(self):
|
86 |
+
"""Comprehensive personality analysis using all tweets."""
|
87 |
all_tweets = [tweet['content'] for tweet in self.tweets]
|
88 |
+
# Use a broader dataset for a comprehensive profile
|
89 |
+
analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
|
90 |
Core beliefs, emotional tendencies, cognitive patterns, etc.
|
91 |
Tweets for analysis:
|
92 |
+
{json.dumps(all_tweets, indent=2)}
|
93 |
"""
|
94 |
|
95 |
response = self.groq_client.chat.completions.create(
|
|
|
103 |
self.personality_profile = response.choices[0].message.content
|
104 |
return self.personality_profile
|
105 |
|
106 |
+
def analyze_topics(self, n_topics=5):
    """Extract and identify different topics the author has tweeted about.

    Fits the shared TF-IDF vectorizer on all tweet contents, then runs NMF
    to decompose the matrix into ``n_topics`` components.

    Args:
        n_topics: Number of NMF components (topics) to extract; also used
            as the number of top terms kept per topic, as before.

    Returns:
        De-duplicated list of topic strings, each being a topic's top
        terms joined by spaces. Order is unspecified because duplicates
        are removed via a set.
    """
    all_tweets = [tweet['content'] for tweet in self.tweets]
    tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
    nmf_model = NMF(n_components=n_topics, random_state=1)
    nmf_model.fit(tfidf_matrix)

    # Hoisted: the original rebuilt get_feature_names_out() for every
    # word of every topic, which is O(topics * words) vocabulary builds.
    feature_names = self.vectorizer.get_feature_names_out()
    topics = []
    for topic in nmf_model.components_:
        # argsort()[:-n_topics - 1:-1] takes the n_topics highest-weight
        # term indices in descending order.
        top_indices = topic.argsort()[:-n_topics - 1:-1]
        topics.append(" ".join(feature_names[i] for i in top_indices))
    return list(set(topics))  # Remove duplicates
|
|
|
|
|
|
|
118 |
|
119 |
def count_tokens(self, text):
    """Roughly estimate how many tokens *text* contains.

    Whitespace-separated word count is used as a cheap proxy for model
    tokens; no tokenizer is involved.
    """
    # split() with no arguments collapses runs of whitespace, so empty
    # or all-whitespace input yields 0.
    words = text.split()
    return len(words)
|
122 |
|
123 |
def generate_tweet(self, context=""):
|
124 |
"""Generate a new tweet based on personality profile and optional context."""
|
125 |
+
historical_topics = self.analyze_topics(n_topics=5)
|
|
|
126 |
additional_contexts = historical_topics + [
|
127 |
"Comment on a recent technological advancement.",
|
128 |
"Share a motivational thought.",
|
|
|
130 |
"Reflect on a past experience.",
|
131 |
"Provide advice to followers."
|
132 |
]
|
|
|
|
|
133 |
selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))
|
134 |
|
135 |
+
# Select tweets close to the author's style
|
136 |
+
tfidf_matrix = self.vectorizer.transform([tweet['content'] for tweet in self.tweets])
|
137 |
+
similarity = cosine_similarity(tfidf_matrix)
|
138 |
+
tweet_sample_indices = similarity.sum(axis=1).argsort()[-5:] # Top 5 similar tweets
|
139 |
+
all_tweets = [self.tweets[i]['content'] for i in tweet_sample_indices]
|
140 |
|
141 |
+
personality_profile_excerpt = self.personality_profile[:400]
|
|
|
142 |
|
|
|
143 |
prompt = f"""Based on this personality profile:
|
144 |
{personality_profile_excerpt}
|
145 |
Current context or topic (if any):
|
146 |
{context}
|
147 |
Additionally, consider these contexts to increase diversity:
|
148 |
{', '.join(selected_contexts)}
|
|
|
149 |
Tweets for context:
|
150 |
{', '.join(all_tweets)}
|
|
|
151 |
**Only generate the tweet. Do not include analysis, explanation, or any other content.**
|
152 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
try:
|
154 |
response = self.groq_client.chat.completions.create(
|
155 |
messages=[
|
|
|
157 |
{"role": "user", "content": prompt},
|
158 |
],
|
159 |
model="llama-3.1-70b-versatile",
|
160 |
+
temperature=1.0,
|
161 |
max_tokens=150,
|
162 |
)
|
163 |
+
tweet = response.choices[0].message.content.strip()
|
164 |
+
return tweet
|
|
|
165 |
except Exception as e:
|
166 |
print(f"Error generating tweet: {e}")
|
167 |
return "Error generating tweet"
|
|