Spaces:
Sleeping
Sleeping
Update tweet_analyzer.py
Browse files - tweet_analyzer.py (+15 -13)
tweet_analyzer.py
CHANGED
@@ -21,6 +21,19 @@ class TweetDatasetProcessor:
|
|
21 |
self.vectorizer = TfidfVectorizer(stop_words='english')
|
22 |
self.used_tweets = set() # Track used tweets to avoid repetition
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
def extract_text_from_pdf(self, pdf_path):
|
25 |
"""Extract text content from PDF file."""
|
26 |
reader = PdfReader(pdf_path)
|
@@ -35,7 +48,8 @@ class TweetDatasetProcessor:
|
|
35 |
raise ValueError("The uploaded PDF appears to be empty.")
|
36 |
|
37 |
lines = text.split('\n')
|
38 |
-
|
|
|
39 |
self.tweets = [tweet for tweet in clean_tweets if tweet]
|
40 |
|
41 |
if not self.tweets:
|
@@ -46,18 +60,6 @@ class TweetDatasetProcessor:
|
|
46 |
df.to_csv('processed_tweets.csv', index=False)
|
47 |
return df
|
48 |
|
49 |
-
def _process_line(self, line):
|
50 |
-
"""Process a single line in parallel."""
|
51 |
-
line = line.strip()
|
52 |
-
if not line or line.startswith('http'): # Skip empty lines and URLs
|
53 |
-
return None
|
54 |
-
return {
|
55 |
-
'content': line,
|
56 |
-
'timestamp': datetime.now(),
|
57 |
-
'mentions': self._extract_mentions(line),
|
58 |
-
'hashtags': self._extract_hashtags(line)
|
59 |
-
}
|
60 |
-
|
61 |
def _extract_mentions(self, text):
|
62 |
"""Extract mentioned users from tweet."""
|
63 |
return [word for word in text.split() if word.startswith('@')]
|
|
|
21 |
self.vectorizer = TfidfVectorizer(stop_words='english')
|
22 |
self.used_tweets = set() # Track used tweets to avoid repetition
|
23 |
|
24 |
+
@staticmethod
|
25 |
+
def _process_line(line):
|
26 |
+
"""Process a single line."""
|
27 |
+
line = line.strip()
|
28 |
+
if not line or line.startswith('http'): # Skip empty lines and URLs
|
29 |
+
return None
|
30 |
+
return {
|
31 |
+
'content': line,
|
32 |
+
'timestamp': datetime.now(),
|
33 |
+
'mentions': [word for word in line.split() if word.startswith('@')],
|
34 |
+
'hashtags': [word for word in line.split() if word.startswith('#')]
|
35 |
+
}
|
36 |
+
|
37 |
def extract_text_from_pdf(self, pdf_path):
|
38 |
"""Extract text content from PDF file."""
|
39 |
reader = PdfReader(pdf_path)
|
|
|
48 |
raise ValueError("The uploaded PDF appears to be empty.")
|
49 |
|
50 |
lines = text.split('\n')
|
51 |
+
# Pass the static method explicitly
|
52 |
+
clean_tweets = Parallel(n_jobs=-1)(delayed(TweetDatasetProcessor._process_line)(line) for line in lines)
|
53 |
self.tweets = [tweet for tweet in clean_tweets if tweet]
|
54 |
|
55 |
if not self.tweets:
|
|
|
60 |
df.to_csv('processed_tweets.csv', index=False)
|
61 |
return df
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
def _extract_mentions(self, text):
|
64 |
"""Extract mentioned users from tweet."""
|
65 |
return [word for word in text.split() if word.startswith('@')]
|