Manasa1 committed
Commit 07099e3 · verified · 1 Parent(s): b6a091c

Update app.py

Files changed (1)
  1. app.py +32 -132
app.py CHANGED
@@ -4,139 +4,39 @@ import random
  from datetime import datetime
  from PyPDF2 import PdfReader
  import json
- from dotenv import load_dotenv
-
- load_dotenv()
-
- class TweetDatasetProcessor:
-     def __init__(self, fine_tuned_model_name, pdf_path):
-         self.tweets = []
-         self.personality_profile = {}
-         self.vectorizer = None  # No need for vectorizer here since we're not clustering
-         self.used_tweets = set()  # Track used tweets to avoid repetition
-         self.pdf_path = pdf_path
-
-         # Load fine-tuned model and tokenizer
-         self.model = AutoModelForCausalLM.from_pretrained(fine_tuned_model_name)
-         self.tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_name)
-
-     @staticmethod
-     def _process_line(line):
-         """Process a single line."""
-         line = line.strip()
-         if not line or line.startswith('http'):  # Skip empty lines and URLs
-             return None
-         # Clean any unwanted characters and fix broken words
-         line = line.replace('\u201c', '"').replace('\u201d', '"')  # Replacing smart quotes
-         return {
-             'content': line,
-             'timestamp': datetime.now(),
-             'mentions': [word for word in line.split() if word.startswith('@')],
-             'hashtags': [word for word in line.split() if word.startswith('#')]
-         }
-
-     def extract_text_from_pdf(self):
-         """Extract text content from PDF file."""
-         reader = PdfReader(self.pdf_path)
-         text = ""
-         for page in reader.pages:
-             text += page.extract_text()
-         return text
-
-     def process_pdf_content(self, text):
-         """Process PDF content and clean extracted tweets."""
-         if not text.strip():
-             raise ValueError("The provided PDF appears to be empty.")
-
-         lines = text.split('\n')
-         clean_tweets = [TweetDatasetProcessor._process_line(line) for line in lines]
-         self.tweets = [tweet for tweet in clean_tweets if tweet]
-
-         if not self.tweets:
-             raise ValueError("No tweets were extracted from the PDF. Ensure the content is properly formatted.")
-
-         return self.tweets
-
-     def analyze_personality(self, max_tweets=50):
-         """Comprehensive personality analysis using a limited subset of tweets."""
-         if not self.tweets:
-             raise ValueError("No tweets available for personality analysis.")
-
-         all_tweets = [tweet['content'] for tweet in self.tweets][:max_tweets]
-         analysis_prompt = f"""Perform a deep psychological analysis of the author based on these tweets:
-         Core beliefs, emotional tendencies, cognitive patterns, etc.
-         Tweets for analysis:
-         {json.dumps(all_tweets, indent=2)}
-         """
-
-         input_ids = self.tokenizer.encode(analysis_prompt, return_tensors='pt')
-         output = self.model.generate(input_ids, max_length=500, num_return_sequences=1, temperature=0.7)
-         personality_analysis = self.tokenizer.decode(output[0], skip_special_tokens=True)
-
-         self.personality_profile = personality_analysis
-         return self.personality_profile
-
-     def generate_tweet(self, context="", sample_size=3):
-         """Generate a new tweet by sampling random tweets and avoiding repetition."""
-         if not self.tweets:
-             return "Error: No tweets available for generation."
-
-         # Randomly sample unique tweets
-         available_tweets = [tweet for tweet in self.tweets if tweet['content'] not in self.used_tweets]
-         if len(available_tweets) < sample_size:
-             self.used_tweets.clear()  # Reset used tweets if all have been used
-             available_tweets = self.tweets
-
-         sampled_tweets = random.sample(available_tweets, sample_size)
-         sampled_contents = [tweet['content'] for tweet in sampled_tweets]
-
-         # Update the used tweets tracker
-         self.used_tweets.update(sampled_contents)
-
-         # Truncate personality profile to avoid token overflow
-         personality_profile_excerpt = self.personality_profile[:400] if len(self.personality_profile) > 400 else self.personality_profile
-
-         # Construct the prompt
-         prompt = f"""Based on this personality profile:
-         {personality_profile_excerpt}
-         Current context or topic (if any):
-         {context}
-         Tweets for context:
-         {', '.join(sampled_contents)}
-         **Only generate the tweet. Do not include analysis, explanation, or any other content.**
-         """
-
-         input_ids = self.tokenizer.encode(prompt, return_tensors='pt', max_length=1024, truncation=True)
-         output = self.model.generate(input_ids, max_length=500, num_return_sequences=1, temperature=1.0)
-         generated_tweet = self.tokenizer.decode(output[0], skip_special_tokens=True).strip()
-
-         return generated_tweet
-
- # Gradio Interface Function
- def gradio_interface():
-     # Path to the PDF with tweets
-     pdf_path = 'Dataset (4).pdf'  # Replace with your PDF file path
-     fine_tuned_model_name = 'Manasa1/GPT2_Finetuned_tweets'  # Replace with the path to your fine-tuned model
-
-     processor = TweetDatasetProcessor(fine_tuned_model_name, pdf_path)
-
-     text = processor.extract_text_from_pdf()
-     tweets = processor.process_pdf_content(text)
-     processor.analyze_personality(max_tweets=50)  # Analyze personality, but don't return the result
-     generated_tweet = processor.generate_tweet(context="AI-powered tweet generation", sample_size=3)
-
-     return generated_tweet  # Only return the generated tweet
-
- # Gradio app setup
- iface = gr.Interface(
-     fn=gradio_interface,
-     inputs=[],
-     outputs=gr.Textbox(label="Generated Tweet"),  # Only output the generated tweet
-     live=False,  # Set to False to generate only when user clicks the button
-     title="AI Tweet Generation",
-     description="Generate tweets based on the personality profile and tweets from a PDF document."
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # Replace 'username/your_model_name' with your Hugging Face model name
+ model_name = "username/your_model_name"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name)
+
+ def generate_tweet(prompt):
+     # Tokenize the input
+     inputs = tokenizer(prompt, return_tensors="pt")
+
+     # Generate text using the model
+     outputs = model.generate(
+         inputs["input_ids"],
+         max_length=280,  # Caps total length in tokens (prompt + output), not characters; see note below
+         num_return_sequences=1,  # Number of tweets to generate
+         top_k=50,  # Sampling from top k tokens
+         top_p=0.95,  # Sampling from top p cumulative probability
+         temperature=0.7,  # Adjust creativity
+         do_sample=True,  # Enable sampling
+     )
+
+     # Decode the generated text
+     tweet = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     return tweet
+
+ interface = gr.Interface(
+     fn=generate_tweet,  # The function to call
+     inputs="text",  # User input is a single text box
+     outputs="text",  # Output is text
+     title="AI Tweet Generator",
+     description="Enter a topic or a few words, and the AI will generate a creative tweet!"
  )

  # Launch the app
- if __name__ == "__main__":
-     iface.launch()
+ interface.launch()
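
A note for anyone adapting the updated app.py: in model.generate, max_length=280 caps the total number of tokens (prompt included), not characters, so a decoded tweet can still run past Twitter's 280-character limit. Below is a minimal post-processing sketch; the enforce_char_limit helper is hypothetical and not part of this commit.

# Hypothetical helper: hard-enforce the 280-character limit after decoding,
# since max_length in model.generate() counts tokens, not characters.
def enforce_char_limit(text: str, limit: int = 280) -> str:
    text = text.strip()
    if len(text) <= limit:
        return text
    # Cut back to the last whole word that fits, then mark the truncation
    return text[: limit - 1].rsplit(" ", 1)[0] + "…"

# Usage sketch, e.g. as the last line of generate_tweet:
# return enforce_char_limit(tweet)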
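
Relatedly, because max_length includes the prompt tokens, a long user prompt leaves less room for generated text. A sketch of the same call using the transformers max_new_tokens argument instead, which caps only the newly generated tokens (the value 60 is illustrative, not from the commit):

outputs = model.generate(
    inputs["input_ids"],
    max_new_tokens=60,  # cap on generated tokens only; prompt length no longer counts
    num_return_sequences=1,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    do_sample=True,
)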