Manasa1 committed on
Commit
a6a0895
·
verified ·
1 Parent(s): 074cf17

Update tweet_analyzer.py

Browse files
Files changed (1) hide show
  1. tweet_analyzer.py +15 -13
tweet_analyzer.py CHANGED
@@ -21,6 +21,19 @@ class TweetDatasetProcessor:
21
  self.vectorizer = TfidfVectorizer(stop_words='english')
22
  self.used_tweets = set() # Track used tweets to avoid repetition
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def extract_text_from_pdf(self, pdf_path):
25
  """Extract text content from PDF file."""
26
  reader = PdfReader(pdf_path)
@@ -35,7 +48,8 @@ class TweetDatasetProcessor:
35
  raise ValueError("The uploaded PDF appears to be empty.")
36
 
37
  lines = text.split('\n')
38
- clean_tweets = Parallel(n_jobs=-1)(delayed(self._process_line)(line) for line in lines)
 
39
  self.tweets = [tweet for tweet in clean_tweets if tweet]
40
 
41
  if not self.tweets:
@@ -46,18 +60,6 @@ class TweetDatasetProcessor:
46
  df.to_csv('processed_tweets.csv', index=False)
47
  return df
48
 
49
- def _process_line(self, line):
50
- """Process a single line in parallel."""
51
- line = line.strip()
52
- if not line or line.startswith('http'): # Skip empty lines and URLs
53
- return None
54
- return {
55
- 'content': line,
56
- 'timestamp': datetime.now(),
57
- 'mentions': self._extract_mentions(line),
58
- 'hashtags': self._extract_hashtags(line)
59
- }
60
-
61
  def _extract_mentions(self, text):
62
  """Extract mentioned users from tweet."""
63
  return [word for word in text.split() if word.startswith('@')]
 
21
  self.vectorizer = TfidfVectorizer(stop_words='english')
22
  self.used_tweets = set() # Track used tweets to avoid repetition
23
 
24
+ @staticmethod
25
+ def _process_line(line):
26
+ """Process a single line."""
27
+ line = line.strip()
28
+ if not line or line.startswith('http'): # Skip empty lines and URLs
29
+ return None
30
+ return {
31
+ 'content': line,
32
+ 'timestamp': datetime.now(),
33
+ 'mentions': [word for word in line.split() if word.startswith('@')],
34
+ 'hashtags': [word for word in line.split() if word.startswith('#')]
35
+ }
36
+
37
  def extract_text_from_pdf(self, pdf_path):
38
  """Extract text content from PDF file."""
39
  reader = PdfReader(pdf_path)
 
48
  raise ValueError("The uploaded PDF appears to be empty.")
49
 
50
  lines = text.split('\n')
51
+ # Pass the static method explicitly
52
+ clean_tweets = Parallel(n_jobs=-1)(delayed(TweetDatasetProcessor._process_line)(line) for line in lines)
53
  self.tweets = [tweet for tweet in clean_tweets if tweet]
54
 
55
  if not self.tweets:
 
60
  df.to_csv('processed_tweets.csv', index=False)
61
  return df
62
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def _extract_mentions(self, text):
64
  """Extract mentioned users from tweet."""
65
  return [word for word in text.split() if word.startswith('@')]