File size: 6,526 Bytes
4c87df2
559513f
 
 
 
 
 
391392f
 
7367aa3
559513f
 
 
 
2317b49
559513f
 
 
 
39904a2
559513f
 
 
 
 
 
 
39904a2
559513f
39904a2
 
 
559513f
39904a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559513f
 
 
 
 
39904a2
391392f
559513f
 
39904a2
559513f
 
 
39904a2
559513f
 
 
39904a2
559513f
 
391392f
559513f
10f6a71
559513f
39904a2
2d11b96
 
 
 
 
 
 
 
 
559513f
 
10f6a71
39904a2
391392f
 
 
 
 
39904a2
391392f
 
 
 
7367aa3
39904a2
 
391392f
3b03cca
559513f
39904a2
2abd5aa
 
 
 
 
 
 
3b03cca
dac9332
10f6a71
3b03cca
dac9332
 
 
 
52b9e07
10f6a71
52b9e07
 
 
10f6a71
52b9e07
559513f
52b9e07
559513f
 
dac9332
 
2d11b96
 
 
39904a2
4c87df2
 
 
 
 
 
 
 
 
 
2d11b96
 
 
4c87df2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os 
from PyPDF2 import PdfReader
import pandas as pd
from dotenv import load_dotenv
import groq
import json
from datetime import datetime
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import random

class TweetDatasetProcessor:
    """Extracts tweets from a PDF archive, builds a personality profile via the
    Groq LLM API, and generates new tweets in the author's style.

    Attributes:
        groq_client: Groq API client (key read from the ``Groq_api`` env var).
        tweets: list of dicts with 'content', 'timestamp', 'mentions', 'hashtags'.
        personality_profile: LLM-produced profile text ("" until analyzed).
    """

    def __init__(self):
        load_dotenv()
        self.groq_client = groq.Groq(api_key=os.getenv('Groq_api'))
        self.tweets = []
        # Fix: initialize as a string, not {} — generate_tweet() slices this
        # with [:500], which would raise TypeError on a dict.
        self.personality_profile = ""

    def extract_text_from_pdf(self, pdf_path):
        """Extract and concatenate text from every page of a PDF file.

        Args:
            pdf_path: path to the PDF file.

        Returns:
            The concatenated page text as a single string.
        """
        reader = PdfReader(pdf_path)
        # extract_text() may return None for image-only/empty pages; coerce to ""
        # so concatenation never raises TypeError.
        return "".join(page.extract_text() or "" for page in reader.pages)

    def process_pdf_content(self, text):
        """Split raw PDF text into cleaned tweets and persist them as CSV.

        Blank lines delimit tweets; lines starting with 'http' (bare URLs)
        are dropped. Writes 'processed_tweets.csv' as a side effect.

        Args:
            text: raw text as produced by extract_text_from_pdf().

        Returns:
            pandas.DataFrame of the processed tweets.
        """
        lines = text.split('\n')
        clean_tweets = []
        buffer = ""

        for line in lines:
            line = line.strip()
            if not line:
                if buffer:  # Blank line terminates the current tweet.
                    clean_tweets.append(buffer.strip())
                    buffer = ""
            elif line.startswith('http'):  # Skip bare URL lines.
                continue
            else:
                buffer += " " + line  # Re-join wrapped lines into one tweet.

        if buffer:  # Flush the trailing tweet (no blank line after it).
            clean_tweets.append(buffer.strip())

        # Build the tweet list with metadata.
        self.tweets = [
            {
                'content': tweet,
                # NOTE: real timestamps are not recoverable from the PDF text
                # here, so a dummy "now" is assigned (see _extract_timestamp).
                'timestamp': datetime.now(),
                'mentions': self._extract_mentions(tweet),
                'hashtags': self._extract_hashtags(tweet)
            }
            for tweet in clean_tweets
        ]

        # Persist the processed tweets for inspection/reuse.
        df = pd.DataFrame(self.tweets)
        df.to_csv('processed_tweets.csv', index=False)
        return df

    def _extract_timestamp(self, text):
        """Extract a timestamp from tweet text. Placeholder: always None."""
        return None  # Implement timestamp extraction logic if needed.

    def _extract_mentions(self, text):
        """Return whitespace-delimited tokens starting with '@'."""
        return [word for word in text.split() if word.startswith('@')]

    def _extract_hashtags(self, text):
        """Return whitespace-delimited tokens starting with '#'."""
        return [word for word in text.split() if word.startswith('#')]

    def analyze_personality(self):
        """Run an LLM-based psychological analysis over the loaded tweets.

        Uses at most the first 20 tweets to keep the prompt small. Stores and
        returns the resulting profile text.

        Returns:
            The personality profile string from the model.
        """
        all_tweets = [tweet['content'] for tweet in self.tweets]
        analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets. Analyze:
        Core beliefs, emotional tendencies, cognitive patterns, etc.
        Tweets for analysis:
        {json.dumps(all_tweets[:20], indent=2)}  # Reduce the number of tweets analyzed
        """

        response = self.groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are an expert psychologist."},
                {"role": "user", "content": analysis_prompt},
            ],
            model="llama-3.1-70b-versatile",
            temperature=0.1,  # Low temperature: we want a stable analysis.
        )
        self.personality_profile = response.choices[0].message.content
        return self.personality_profile

    def analyze_topics(self, n_topics=3, n_top_words=None):
        """Identify recurring topics in the tweets via TF-IDF + NMF.

        Args:
            n_topics: requested number of NMF components (clamped to the
                TF-IDF matrix dimensions, so small corpora no longer crash).
            n_top_words: words kept per topic; defaults to ``n_topics`` to
                preserve the original behavior.

        Returns:
            De-duplicated list of topic strings (space-joined top words);
            empty list when no tweets are loaded.
        """
        all_tweets = [tweet['content'] for tweet in self.tweets]
        if not all_tweets:
            # TfidfVectorizer raises on an empty corpus; nothing to analyze.
            return []
        if n_top_words is None:
            n_top_words = n_topics

        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(all_tweets)
        # NMF requires n_components <= min(n_samples, n_features).
        n_components = max(1, min(n_topics, *tfidf_matrix.shape))
        nmf_model = NMF(n_components=n_components, random_state=1)
        nmf_model.fit(tfidf_matrix)

        feature_names = vectorizer.get_feature_names_out()
        topics = []
        for topic in nmf_model.components_:
            # argsort()[::-1] order, truncated to the top n_top_words terms.
            top_indices = topic.argsort()[:-n_top_words - 1:-1]
            topics.append(" ".join(feature_names[i] for i in top_indices))

        # De-duplicate while preserving order (set() gave arbitrary order).
        return list(dict.fromkeys(topics))

    def generate_tweet(self, context=""):
        """Generate one new tweet in the author's voice via the Groq API.

        Args:
            context: optional topic/context string to steer generation.

        Returns:
            The generated tweet text (first line of the model output), or the
            string "Error generating tweet" on API failure.
        """
        additional_contexts = [
            "Comment on a recent technological advancement.",
            "Share a motivational thought.",
            "Discuss a current trending topic.",
            "Reflect on a past experience.",
            "Provide advice to followers."
        ]

        # Mix historical topics into the context pool for diversity.
        historical_topics = self.analyze_topics(n_topics=3)
        additional_contexts.extend(historical_topics)

        # Pick a few contexts at random to vary successive generations.
        selected_contexts = random.sample(additional_contexts, min(3, len(additional_contexts)))

        # Sample tweets rather than sending the whole corpus (prompt budget).
        tweet_sample = random.sample(self.tweets, min(20, len(self.tweets)))
        all_tweets = [tweet['content'] for tweet in tweet_sample]

        # Truncate the profile to keep the prompt within token limits.
        personality_profile_excerpt = self.personality_profile[:500]

        generation_prompt = f"""Based on this personality profile:
        {personality_profile_excerpt}
        Current context or topic (if any):
        {context}
        Additionally, consider these contexts to increase diversity:
        {', '.join(selected_contexts)}

        **Only generate the tweet. Do not include analysis, explanation, or any other content.**
        """

        try:
            response = self.groq_client.chat.completions.create(
                messages=[
                    {"role": "system", "content": "You are an expert in replicating writing and thinking patterns."},
                    {"role": "user", "content": generation_prompt},
                ],
                model="llama-3.1-70b-versatile",
                temperature=1.0,  # High temperature for more diverse output.
                max_tokens=150,
            )
            tweet = response.choices[0].message.content
            # Keep only the first line in case the model adds commentary.
            return tweet.strip().split("\n")[0]
        except Exception as e:
            # Best-effort: report and return a sentinel instead of crashing.
            print(f"Error generating tweet: {e}")
            return "Error generating tweet"