"""Turn a PDF of tweets into a dataset and model its author's voice via Groq."""

import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Optional

import groq
import pandas as pd
from dotenv import load_dotenv
from PyPDF2 import PdfReader

# Model used for every chat completion unless the caller overrides it.
# NOTE(review): hosted model names get retired over time -- confirm this one
# is still served by Groq before relying on the default.
DEFAULT_MODEL = "mixtral-8x7b-32768"

# A sigil only counts when it is not glued to a preceding word character, so
# "user@example.com" is not a mention and "C#sharp" is not a hashtag.
_MENTION_RE = re.compile(r"(?<!\w)@\w+")
_HASHTAG_RE = re.compile(r"(?<!\w)#\w+")

_ANALYSIS_SYSTEM_PROMPT = (
    "You are an expert psychologist specializing in personality analysis "
    "through written communication."
)

_GENERATION_SYSTEM_PROMPT = (
    "You are an expert in replicating individual writing and thinking patterns."
)

# Filled with str.format(); substituted values are not re-parsed, so braces
# inside the tweets or the profile are safe.
_ANALYSIS_PROMPT_TEMPLATE = """\
Perform a deep psychological analysis of the author based on these tweets. Analyze:

1. Core Beliefs and Values:
- What fundamental beliefs shape their worldview?
- What causes or issues do they care about?

2. Cognitive Patterns:
- How do they process information?
- What decision-making patterns are visible?

3. Emotional Tendencies:
- What triggers emotional responses?
- How do they express emotions?

4. Social Interaction Style:
- How do they engage with others?
- What relationship patterns emerge?

5. Knowledge Areas:
- What topics do they discuss with expertise?
- What experiences do they draw from?

6. Communication Style:
- Vocabulary preferences
- Rhetorical patterns
- Humor style

7. Behavioral Patterns:
- Daily routines mentioned
- Regular activities
- Habits and preferences

Tweets for analysis:
{tweets}
"""

_GENERATION_PROMPT_TEMPLATE = """\
Based on this personality profile:
{profile}

Current context or topic (if any):
{context}

Generate a tweet that this person would write right now. Consider:
1. Their core beliefs and values
2. Their typical emotional expression
3. Their communication style and vocabulary
4. Their knowledge areas and experiences
5. Current context (if provided)

The tweet should feel indistinguishable from their authentic tweets.
"""


class TweetDatasetProcessor:
    """Extract tweets from a PDF, profile their author, and imitate them.

    Typical flow: ``extract_text_from_pdf`` -> ``process_pdf_content`` ->
    ``analyze_personality`` -> ``generate_tweet``.

    Attributes:
        groq_client: Groq SDK client used for all chat completions.
        model: Name of the chat model sent with every request.
        tweets: Accumulated tweet records (dicts with ``content``,
            ``timestamp``, ``mentions`` and ``hashtags`` keys).
        personality_profile: Text returned by the last
            ``analyze_personality`` call; empty string until then.
    """

    def __init__(self, model: str = DEFAULT_MODEL) -> None:
        """Load environment variables and create the Groq client.

        Args:
            model: Chat model name used by ``analyze_personality`` and
                ``generate_tweet``.
        """
        load_dotenv()
        # The key is read from the ``Groq_api`` environment variable
        # (spelling kept as-is so existing .env files keep working).
        self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
        self.model = model
        self.tweets: List[Dict[str, Any]] = []
        # Starts as an empty string (not a dict): the profile is the model's
        # text answer and is interpolated directly into prompts.
        self.personality_profile: str = ""

    def extract_text_from_pdf(self, pdf_path) -> str:
        """Return the text of every page of the PDF at ``pdf_path``.

        Pages are joined with a newline so the last line of one page is not
        fused with the first line of the next. A page for which the reader
        yields no text contributes an empty string instead of raising.
        """
        reader = PdfReader(pdf_path)
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    def process_pdf_content(
        self,
        text: str,
        output_path: Optional[str] = "processed_tweets.csv",
    ) -> pd.DataFrame:
        """Split ``text`` into tweets, record them, and export the dataset.

        Every non-blank line is treated as one tweet and appended to
        ``self.tweets``; records from earlier calls are kept, so the returned
        frame (and the CSV) contain everything processed so far.

        Args:
            text: Raw text, one tweet per line.
            output_path: Where to write the CSV. Pass ``None`` to skip
                writing a file.

        Returns:
            A DataFrame built from all accumulated tweet records.
        """
        # Used when a line carries no timestamp of its own; taken once so a
        # whole batch shares the same processing time.
        fallback_timestamp = datetime.now()

        for raw_line in text.split('\n'):
            content = raw_line.strip()
            if not content:
                continue
            self.tweets.append({
                'content': content,
                'timestamp': self._extract_timestamp(raw_line) or fallback_timestamp,
                'mentions': self._extract_mentions(raw_line),
                'hashtags': self._extract_hashtags(raw_line),
            })

        df = pd.DataFrame(self.tweets)
        if output_path is not None:
            df.to_csv(output_path, index=False)
        return df

    def _extract_timestamp(self, text: str) -> Optional[datetime]:
        """Return the timestamp embedded in ``text``, or ``None``.

        Currently a stub that always returns ``None``.
        """
        # TODO: implement timestamp extraction logic.
        return None

    def _extract_mentions(self, text: str) -> List[str]:
        """Return the ``@handle`` mentions in ``text``, in order of appearance.

        Each item keeps its leading ``@`` and stops at the first non-word
        character, so trailing punctuation ("@bob,") is not included and a
        lone "@" is ignored.
        """
        return _MENTION_RE.findall(text)

    def _extract_hashtags(self, text: str) -> List[str]:
        """Return the ``#hashtags`` in ``text``, in order of appearance.

        Each item keeps its leading ``#`` and stops at the first non-word
        character, so trailing punctuation ("#ai!") is not included and a
        lone "#" is ignored.
        """
        return _HASHTAG_RE.findall(text)

    def _chat(self, system_prompt: str, user_prompt: str, temperature: float):
        """Send one system+user exchange to Groq and return the reply text."""
        response = self.groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model=self.model,
            temperature=temperature,
        )
        return response.choices[0].message.content

    def analyze_personality(self, max_tweets: Optional[int] = 30):
        """Ask the model for a personality profile of the tweets' author.

        The result is stored in ``self.personality_profile`` and returned.

        Args:
            max_tweets: How many tweets (from the start of ``self.tweets``)
                to include in the prompt. ``None`` sends all of them.

        Raises:
            ValueError: If no tweets have been processed yet; a profile built
                from an empty list would be pure invention.
        """
        if not self.tweets:
            raise ValueError(
                "No tweets to analyze; call process_pdf_content() first."
            )

        sample = [tweet['content'] for tweet in self.tweets[:max_tweets]]
        analysis_prompt = _ANALYSIS_PROMPT_TEMPLATE.format(
            # ensure_ascii=False keeps emoji and non-Latin text readable for
            # the model instead of turning them into \uXXXX escapes.
            tweets=json.dumps(sample, indent=2, ensure_ascii=False),
        )

        # Low temperature: the analysis should be stable across runs.
        self.personality_profile = self._chat(
            _ANALYSIS_SYSTEM_PROMPT, analysis_prompt, temperature=0.1
        )
        return self.personality_profile

    def generate_tweet(self, context=""):
        """Generate a new tweet in the author's voice.

        Args:
            context: Optional topic or situation the tweet should react to.

        Returns:
            The model's reply text.

        Raises:
            ValueError: If no personality profile is available yet.
        """
        if not self.personality_profile:
            raise ValueError(
                "No personality profile available; call analyze_personality() first."
            )

        generation_prompt = _GENERATION_PROMPT_TEMPLATE.format(
            profile=self.personality_profile,
            context=context,
        )

        # Higher temperature than the analysis: generated tweets should vary.
        return self._chat(
            _GENERATION_SYSTEM_PROMPT, generation_prompt, temperature=0.7
        )