Spaces:

Manasa1
/

Jack_Clone

Sleeping

Manasa1 committed on Nov 22, 2024

Commit

a6a0895

verified ·

1 Parent(s): 074cf17

Update tweet_analyzer.py

Files changed (1) hide show

tweet_analyzer.py CHANGED Viewed

@@ -21,6 +21,19 @@ class TweetDatasetProcessor:
         self.vectorizer = TfidfVectorizer(stop_words='english')
         self.used_tweets = set()  # Track used tweets to avoid repetition
     def extract_text_from_pdf(self, pdf_path):
         """Extract text content from PDF file."""
         reader = PdfReader(pdf_path)
@@ -35,7 +48,8 @@ class TweetDatasetProcessor:
             raise ValueError("The uploaded PDF appears to be empty.")
         lines = text.split('\n')
-        clean_tweets = Parallel(n_jobs=-1)(delayed(self._process_line)(line) for line in lines)
         self.tweets = [tweet for tweet in clean_tweets if tweet]
         if not self.tweets:
@@ -46,18 +60,6 @@ class TweetDatasetProcessor:
         df.to_csv('processed_tweets.csv', index=False)
         return df
-    def _process_line(self, line):
-        """Process a single line in parallel."""
-        line = line.strip()
-        if not line or line.startswith('http'):  # Skip empty lines and URLs
-            return None
-        return {
-            'content': line,
-            'timestamp': datetime.now(),
-            'mentions': self._extract_mentions(line),
-            'hashtags': self._extract_hashtags(line)
-        }
     def _extract_mentions(self, text):
         """Extract mentioned users from tweet.

         Splits ``text`` on whitespace and returns, in order of appearance,
         every token that begins with '@'. Tokens are returned unchanged: the
         leading '@' and any attached punctuation are kept, and duplicates
         are not removed.
         """
         return [word for word in text.split() if word.startswith('@')]

         self.vectorizer = TfidfVectorizer(stop_words='english')
         self.used_tweets = set()  # Track used tweets to avoid repetition
+    @staticmethod
+    def _process_line(line):
+        """Process a single line."""
+        line = line.strip()
+        if not line or line.startswith('http'):  # Skip empty lines and URLs
+            return None
+        return {
+            'content': line,
+            'timestamp': datetime.now(),
+            'mentions': [word for word in line.split() if word.startswith('@')],
+            'hashtags': [word for word in line.split() if word.startswith('#')]
+        }
     def extract_text_from_pdf(self, pdf_path):
         """Extract text content from PDF file."""
         reader = PdfReader(pdf_path)
             raise ValueError("The uploaded PDF appears to be empty.")
         lines = text.split('\n')
+        # Pass the static method explicitly
+        clean_tweets = Parallel(n_jobs=-1)(delayed(TweetDatasetProcessor._process_line)(line) for line in lines)
         self.tweets = [tweet for tweet in clean_tweets if tweet]
         if not self.tweets:
         df.to_csv('processed_tweets.csv', index=False)
         return df
     def _extract_mentions(self, text):
         """Extract mentioned users from tweet.

         Splits ``text`` on whitespace and returns, in order of appearance,
         every token that begins with '@'. Tokens are returned unchanged: the
         leading '@' and any attached punctuation are kept, and duplicates
         are not removed.
         """
         return [word for word in text.split() if word.startswith('@')]