Spaces:

Manasa1
/

tweets_clone

Sleeping

File size: 5,750 Bytes

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import random
from datetime import datetime
from PyPDF2 import PdfReader
import json
from dotenv import load_dotenv

# Runs at import time, before any model is loaded; presumably pulls settings
# (e.g. access tokens) from a local .env file into the environment — confirm
# which variables the deployment actually relies on.
load_dotenv()

class TweetDatasetProcessor:
    """Turn a PDF of tweets into prompts for a fine-tuned causal language model.

    Typical flow: ``extract_text_from_pdf`` -> ``process_pdf_content`` ->
    ``analyze_personality`` -> ``generate_tweet``.

    Attributes are documented where they are assigned in ``__init__``.
    """

    # Upper bounds on freshly generated tokens (the prompt is not counted).
    ANALYSIS_MAX_NEW_TOKENS = 300
    TWEET_MAX_NEW_TOKENS = 60
    # Used only when the model config does not expose a context length.
    DEFAULT_CONTEXT_TOKENS = 1024

    def __init__(self, fine_tuned_model_name, pdf_path):
        """Load the model/tokenizer and set up empty processing state.

        Args:
            fine_tuned_model_name: Name or path handed to ``from_pretrained``.
            pdf_path: Path of the PDF whose extracted text is split into tweets.
        """
        # Parsed tweet records produced by process_pdf_content().
        self.tweets = []
        # Text of the latest analysis. Starts as an empty string (not a dict)
        # because generate_tweet() slices it into the prompt.
        self.personality_profile = ""
        # Unused; kept so existing attribute access keeps working.
        self.vectorizer = None
        # Tweet contents already sampled, so later samples avoid repeats.
        self.used_tweets = set()
        self.pdf_path = pdf_path

        # Load fine-tuned model and tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)

    @staticmethod
    def _process_line(line):
        """Convert one raw text line into a tweet record.

        Args:
            line: A single line of extracted PDF text.

        Returns:
            ``None`` for blank lines and lines starting with ``http``;
            otherwise a dict with keys ``content`` (stripped line),
            ``timestamp`` (time of processing, not of the tweet),
            ``mentions`` (words starting with ``@``) and ``hashtags``
            (words starting with ``#``).
        """
        line = line.strip()
        if not line or line.startswith('http'):  # Skip empty lines and URLs
            return None
        words = line.split()  # split once, reuse for both scans
        return {
            'content': line,
            'timestamp': datetime.now(),
            'mentions': [word for word in words if word.startswith('@')],
            'hashtags': [word for word in words if word.startswith('#')],
        }

    def extract_text_from_pdf(self):
        """Extract the text of every page of ``self.pdf_path``.

        Returns:
            The page texts joined with newlines, so the last line of one page
            is never fused with the first line of the next. Pages for which
            the reader yields no text contribute an empty string.
        """
        reader = PdfReader(self.pdf_path)
        # extract_text() may yield None for pages without a text layer.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    def process_pdf_content(self, text):
        """Split extracted text into lines and keep those that look like tweets.

        Args:
            text: Raw text, one candidate tweet per line.

        Returns:
            The list of tweet records, also stored in ``self.tweets``.

        Raises:
            ValueError: If ``text`` is blank or no line survives filtering.
        """
        if not text.strip():
            raise ValueError("The provided PDF appears to be empty.")

        records = (TweetDatasetProcessor._process_line(line) for line in text.split('\n'))
        self.tweets = [record for record in records if record]

        if not self.tweets:
            raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")

        return self.tweets

    def analyze_personality(self, max_tweets=50):
        """Ask the model for a personality analysis of the first tweets.

        Args:
            max_tweets: Maximum number of tweets (from the start of
                ``self.tweets``) included in the prompt.

        Returns:
            The generated analysis text, also stored in
            ``self.personality_profile``.

        Raises:
            ValueError: If no tweets have been loaded.
        """
        if not self.tweets:
            raise ValueError("No tweets available for personality analysis.")

        all_tweets = [tweet['content'] for tweet in self.tweets[:max_tweets]]
        analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
        Core beliefs, emotional tendencies, cognitive patterns, etc.
        Tweets for analysis:
        {json.dumps(all_tweets, indent=2)}
        """

        self.personality_profile = self._generate_text(
            analysis_prompt,
            max_new_tokens=self.ANALYSIS_MAX_NEW_TOKENS,
            temperature=0.7,
        )
        return self.personality_profile

    def generate_tweet(self, context="", sample_size=3):
        """Generate a new tweet conditioned on the profile and sampled tweets.

        Args:
            context: Optional topic inserted into the prompt.
            sample_size: Number of example tweets to sample; capped at the
                number of tweets available.

        Returns:
            The generated text, or an error string when no tweets are loaded.
        """
        if not self.tweets:
            return "Error: No tweets available for generation."

        sampled_contents = self._sample_contents(sample_size)

        # Truncate personality profile to avoid token overflow
        personality_profile_excerpt = str(self.personality_profile)[:400]

        # Construct the prompt
        prompt = f"""Based on this personality profile:
        {personality_profile_excerpt}
        Current context or topic (if any):
        {context}
        Tweets for context:
        {', '.join(sampled_contents)}
        **Only generate the tweet. Do not include analysis, explanation, or any other content.**
        """

        return self._generate_text(
            prompt,
            max_new_tokens=self.TWEET_MAX_NEW_TOKENS,
            temperature=1.0,
        )

    def _sample_contents(self, sample_size):
        """Pick up to ``sample_size`` tweet texts, preferring unused ones."""
        available = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
        if len(available) < sample_size:
            self.used_tweets.clear()  # Reset used tweets if all have been used
            available = self.tweets

        # random.sample raises ValueError when asked for more items than exist.
        count = max(0, min(sample_size, len(available)))
        contents = [tweet['content'] for tweet in random.sample(available, count)]

        self.used_tweets.update(contents)
        return contents

    def _generate_text(self, prompt, max_new_tokens, temperature):
        """Run the model on ``prompt`` and return only the newly generated text."""
        context_limit = (
            getattr(self.model.config, "max_position_embeddings", None)
            or self.DEFAULT_CONTEXT_TOKENS
        )
        # Leave room in the context window for the tokens to be generated.
        prompt_budget = max(1, context_limit - max_new_tokens)
        encoded = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=prompt_budget,
        )
        output = self.model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,  # budget excludes the prompt
            num_return_sequences=1,
            do_sample=True,  # temperature has no effect without sampling
            temperature=temperature,
            pad_token_id=self.tokenizer.eos_token_id,
        )
        # The returned sequence starts with the prompt tokens; drop them.
        prompt_length = encoded["input_ids"].shape[-1]
        new_tokens = output[0][prompt_length:]
        return self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Gradio Interface Function
# Prepared processors keyed by (model name, PDF path), so the model weights and
# the PDF are loaded once per process instead of on every request.
_PROCESSOR_CACHE = {}


def _get_processor(fine_tuned_model_name, pdf_path):
    """Return a processor with its tweets loaded, building it on first use."""
    key = (fine_tuned_model_name, pdf_path)
    processor = _PROCESSOR_CACHE.get(key)
    if processor is None:
        processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path)
        processor.process_pdf_content(processor.extract_text_from_pdf())
        # Cache only after loading succeeded, so a failed attempt is retried.
        _PROCESSOR_CACHE[key] = processor
    return processor


def gradio_interface():
    """Run personality analysis and tweet generation for the bundled dataset.

    The processor is reused between calls, so its record of already-sampled
    tweets carries over and successive calls draw different example tweets.

    Returns:
        A ``(personality_analysis, generated_tweet)`` tuple of strings.

    Raises:
        ValueError: Propagated from the processor when the PDF yields no tweets.
    """
    # Path to the PDF with tweets
    pdf_path = 'Dataset (4).pdf'  # Replace with your PDF file path
    fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model

    processor = _get_processor(fine_tuned_model_name, pdf_path)

    personality_analysis = processor.analyze_personality(max_tweets=50)
    generated_tweet = processor.generate_tweet(context="AI-powered tweet generation", sample_size=3)

    return personality_analysis, generated_tweet

# Gradio app setup: no input components, two text outputs fed by the
# (personality_analysis, generated_tweet) pair that gradio_interface() returns.
iface = gr.Interface(
    title="AI Personality and Tweet Generation",
    description="Automatically analyze personality and generate tweets based on a provided PDF of tweets.",
    fn=gradio_interface,
    inputs=[],
    outputs=[
        gr.Textbox(label="Personality Analysis"),
        gr.Textbox(label="Generated Tweet"),
    ],
    live=True,
)

# Launch the app only when this file is executed as a script; importing the
# module builds `iface` but does not start it.
if __name__ == "__main__":
    iface.launch()