Spaces:

Manasa1
/

tweets_clone

Sleeping

App Files Files Community

Manasa1 committed on Dec 15, 2024

Commit

b6a091c

verified ·

1 Parent(s): ac5fd0d

Delete tweet_analyzer.py

Browse files

Files changed (1) hide show

tweet_analyzer.py +0 -130

tweet_analyzer.py DELETED Viewed

@@ -1,130 +0,0 @@
-import os
-from PyPDF2 import PdfReader
-import pandas as pd
-from dotenv import load_dotenv
-import json
-from datetime import datetime
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.cluster import KMeans
-import random
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
-class TweetDatasetProcessor:
-    """Build a tweet dataset from PDF text and generate text with a causal LM.
-
-    The processor turns each non-empty, non-URL text line into a tweet
-    record, can cluster the records with TF-IDF + KMeans, and prompts the
-    loaded language model for a personality analysis and for new tweets.
-
-    Instance attributes (all set in ``__init__``):
-        tweets: list of record dicts produced by ``_process_line``.
-        personality_profile: text returned by ``analyze_personality``;
-            empty until that method has run.
-        vectorizer: the ``TfidfVectorizer`` used by ``categorize_tweets``.
-        used_tweets: tweet contents already used as generation context.
-        model / tokenizer: objects loaded from ``fine_tuned_model_name``.
-    """
-
-    def __init__(self, fine_tuned_model_name):
-        """Load environment variables, the language model and its tokenizer.
-
-        Args:
-            fine_tuned_model_name: Passed unchanged to
-                ``AutoModelForCausalLM.from_pretrained`` and
-                ``AutoTokenizer.from_pretrained``.
-        """
-        load_dotenv()
-        self.tweets = []
-        # The profile is always text once set; starting from "" (not a dict)
-        # keeps a literal "{}" out of the tweet-generation prompt.
-        self.personality_profile = ""
-        self.vectorizer = TfidfVectorizer(stop_words='english')
-        self.used_tweets = set()  # Track used tweets to avoid repetition
-        # Load fine-tuned model and tokenizer
-        self.model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)
-        self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
-
-    @staticmethod
-    def _process_line(line):
-        """Convert one raw text line into a tweet record.
-
-        Args:
-            line: A single line of extracted text.
-
-        Returns:
-            ``None`` for blank lines and for lines starting with ``http``;
-            otherwise a dict with the keys ``content`` (stripped line),
-            ``timestamp`` (the time of processing, not of posting),
-            ``mentions`` (words starting with ``@``) and ``hashtags``
-            (words starting with ``#``).
-        """
-        line = line.strip()
-        if not line or line.startswith('http'):  # Skip empty lines and URLs
-            return None
-        words = line.split()  # split once, reuse for both scans
-        return {
-            'content': line,
-            'timestamp': datetime.now(),
-            'mentions': [word for word in words if word.startswith('@')],
-            'hashtags': [word for word in words if word.startswith('#')],
-        }
-
-    def extract_text_from_pdf(self, pdf_path):
-        """Extract the text of every page of a PDF file.
-
-        Args:
-            pdf_path: Whatever ``PdfReader`` accepts as its source.
-
-        Returns:
-            The text of all pages, joined with newlines.
-        """
-        reader = PdfReader(pdf_path)
-        # Joining with "\n" keeps the last line of one page from being fused
-        # with the first line of the next (tweets are split on newlines
-        # later). ``or ""`` guards against a page that yields no text.
-        return "\n".join(page.extract_text() or "" for page in reader.pages)
-
-    def process_pdf_content(self, text):
-        """Split extracted text into tweet records and save them as CSV.
-
-        Args:
-            text: Text as returned by ``extract_text_from_pdf``.
-
-        Returns:
-            A ``pandas.DataFrame`` with one row per tweet record. The same
-            data is written to ``processed_tweets.csv`` in the current
-            working directory.
-
-        Raises:
-            ValueError: If ``text`` is blank or no line yields a record.
-        """
-        if not text.strip():
-            raise ValueError("The uploaded PDF appears to be empty.")
-        records = (self._process_line(line) for line in text.split('\n'))
-        self.tweets = [record for record in records if record]
-        if not self.tweets:
-            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")
-        # Save the processed tweets to a CSV
-        df = pd.DataFrame(self.tweets)
-        df.to_csv('processed_tweets.csv', index=False)
-        return df
-
-    def categorize_tweets(self, n_clusters=5):
-        """Cluster tweets into categories using TF-IDF features and KMeans.
-
-        Each record in ``self.tweets`` gains a ``category`` key of the form
-        ``"Category <label>"``.
-
-        Args:
-            n_clusters: Desired number of clusters. It is capped at the
-                number of tweets, because KMeans cannot fit more clusters
-                than it has samples.
-
-        Returns:
-            A ``pandas.DataFrame`` of the tweet records including the new
-            ``category`` column.
-
-        Raises:
-            ValueError: If there are no tweets to cluster.
-        """
-        all_tweets = [tweet['content'] for tweet in self.tweets]
-        if not all_tweets:
-            raise ValueError("No tweets available for clustering.")
-        tfidf_matrix = self.vectorizer.fit_transform(all_tweets)
-        effective_clusters = max(1, min(n_clusters, len(all_tweets)))
-        kmeans = KMeans(n_clusters=effective_clusters, random_state=1)
-        kmeans.fit(tfidf_matrix)
-        for tweet, label in zip(self.tweets, kmeans.labels_):
-            tweet['category'] = f"Category {label}"
-        return pd.DataFrame(self.tweets)
-
-    def _generate_text(self, prompt, max_new_tokens, temperature):
-        """Run the model on ``prompt`` and return only the newly generated text."""
-        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
-        output = self.model.generate(
-            input_ids,
-            # Budget the continuation only; ``max_length`` would also count
-            # the prompt tokens and leave no room after a long prompt.
-            max_new_tokens=max_new_tokens,
-            num_return_sequences=1,
-            # ``temperature`` only takes effect when sampling is enabled.
-            do_sample=True,
-            temperature=temperature,
-            pad_token_id=self.tokenizer.eos_token_id,
-        )
-        # A causal LM returns prompt + continuation; drop the prompt tokens
-        # so callers get just the model's answer.
-        new_tokens = output[0][input_ids.shape[-1]:]
-        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
-
-    def _sample_context_tweets(self, sample_size):
-        """Pick up to ``sample_size`` tweet contents, preferring unused ones."""
-        available_tweets = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
-        if len(available_tweets) < sample_size:
-            self.used_tweets.clear()  # Reset used tweets if all have been used
-            available_tweets = self.tweets
-        # ``random.sample`` raises if asked for more items than exist.
-        count = max(0, min(sample_size, len(available_tweets)))
-        sampled_contents = [tweet['content'] for tweet in random.sample(available_tweets, count)]
-        # Update the used tweets tracker
-        self.used_tweets.update(sampled_contents)
-        return sampled_contents
-
-    def analyze_personality(self, max_tweets=50, max_new_tokens=500):
-        """Ask the model for a personality analysis of the tweet author.
-
-        Args:
-            max_tweets: Only the first ``max_tweets`` tweets are put into
-                the prompt.
-            max_new_tokens: Upper bound on the length of the generated
-                analysis, in tokens.
-
-        Returns:
-            The generated analysis text (without the prompt). It is also
-            stored in ``self.personality_profile``.
-
-        Raises:
-            ValueError: If there are no tweets to analyse.
-        """
-        if not self.tweets:
-            raise ValueError("No tweets available for personality analysis.")
-        all_tweets = [tweet['content'] for tweet in self.tweets][:max_tweets]
-        analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
-        Core beliefs, emotional tendencies, cognitive patterns, etc.
-        Tweets for analysis:
-        {json.dumps(all_tweets, indent=2)}
-        """
-        self.personality_profile = self._generate_text(
-            analysis_prompt, max_new_tokens=max_new_tokens, temperature=0.7
-        )
-        return self.personality_profile
-
-    def generate_tweet(self, context="", sample_size=3, max_new_tokens=80):
-        """Generate a new tweet from the profile and a random sample of tweets.
-
-        Args:
-            context: Optional topic or context inserted into the prompt.
-            sample_size: Number of existing tweets to show the model; fewer
-                are used when fewer are available.
-            max_new_tokens: Upper bound on the length of the generated
-                tweet, in tokens.
-
-        Returns:
-            The generated tweet text (without the prompt), or the string
-            ``"Error: No tweets available for generation."`` when no tweets
-            have been loaded.
-        """
-        if not self.tweets:
-            return "Error: No tweets available for generation."
-        sampled_contents = self._sample_context_tweets(sample_size)
-        # Truncate personality profile to avoid token overflow; an unset
-        # profile contributes nothing to the prompt.
-        profile = self.personality_profile
-        personality_profile_excerpt = str(profile)[:400] if profile else ""
-        # Construct the prompt
-        prompt = f"""Based on this personality profile:
-        {personality_profile_excerpt}
-        Current context or topic (if any):
-        {context}
-        Tweets for context:
-        {', '.join(sampled_contents)}
-        **Only generate the tweet. Do not include analysis, explanation, or any other content.**
-        """
-        return self._generate_text(prompt, max_new_tokens=max_new_tokens, temperature=1.0)