Agrannya committed on
Commit
29f8f72
·
verified ·
1 Parent(s): 4132075

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. deploy (1).py +591 -0
  2. deploy.py +591 -356
deploy (1).py ADDED
@@ -0,0 +1,591 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+ import torch
5
+ import emoji
6
+ import re
7
+ import numpy as np
8
+ from collections import Counter
9
+ from instagrapi import Client
10
+ from transformers import (
11
+ pipeline,
12
+ AutoTokenizer,
13
+ AutoModelForSequenceClassification,
14
+ Trainer,
15
+ TrainingArguments,
16
+ RobertaForSequenceClassification,
17
+ AlbertForSequenceClassification
18
+ )
19
+ from datasets import Dataset, Features, Value
20
+ from sklearn.metrics import accuracy_score, f1_score
21
+
22
# Tunable parameters for tokenization, fine-tuning and label combination.
CONFIG = dict(
    max_length=128,
    batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    few_shot_examples=5,
    confidence_threshold=0.7,
    neutral_reanalysis_threshold=0.33,
)

# Module-level state shared between the Gradio callbacks.
cl = None                   # Instagram client, set by the login callback
explore_reels_list = []     # reels returned by the most recent fetch
sentiment_analyzer = None   # created on first analysis
content_classifier = None   # created on first analysis

# Keywords that assign a caption to a category without running a model.
# Insertion order matters: the first category with a matching keyword wins.
CATEGORY_KEYWORDS = {
    "news": {"news", "update", "breaking", "reported", "headlines"},
    "meme": {"meme", "funny", "lol", "haha", "relatable"},
    "sports": {"sports", "cricket", "football", "match", "game", "team", "score"},
    "science": {"science", "research", "discovery", "experiment", "facts", "theory"},
    "music": {"music", "song", "album", "release", "artist", "beats"},
    "movie": {"movie", "film", "bollywood", "trailer", "series", "actor"},
    "gym": {"gym", "workout", "fitness", "exercise", "training", "bodybuilding"},
    "comedy": {"comedy", "joke", "humor", "standup", "skit", "laugh"},
    "food": {"food", "recipe", "cooking", "eat", "delicious", "restaurant", "kitchen"},
    "technology": {"tech", "phone", "computer", "ai", "gadget", "software", "innovation"},
    "travel": {"travel", "trip", "vacation", "explore", "destination", "adventure"},
    "fashion": {"fashion", "style", "ootd", "outfit", "trends", "clothing"},
    "art": {"art", "artist", "painting", "drawing", "creative", "design"},
    "business": {"business", "startup", "marketing", "money", "finance", "entrepreneur"},
}

# Candidate labels for the zero-shot classifier: the keyword categories,
# in the same order.
CONTENT_CATEGORIES = list(CATEGORY_KEYWORDS)
61
+
62
class ReelSentimentAnalyzer:
    """Label reel captions as positive, neutral or negative.

    Captions containing a promotional keyword are labelled neutral without
    running a model.  Captions detected as Hindi (Devanagari) or Hinglish are
    scored by an IndicBERT sequence classifier; all other captions are scored
    by an emotion model and a sentiment model whose outputs are combined.
    """

    # Regex -> replacement pairs applied case-insensitively, in this order,
    # during preprocessing.  The identity entries only normalize letter case.
    _ABBREVIATIONS = {
        r"\bomg\b": "oh my god",
        r"\btbh\b": "to be honest",
        r"\bky\b": "kyun",
        r"\bkb\b": "kab",
        r"\bkya\b": "kya",
        r"\bkahan\b": "kahan",
        r"\bkaisa\b": "kaisa",
    }

    # Romanized Hindi words whose presence marks a caption as Hinglish.
    _HINGLISH_KEYWORDS = ("hai", "kyun", "nahi", "kya", "acha", "bas", "yaar", "main")

    # Keywords used to re-label low-confidence neutral captions.
    _STRONG_POSITIVE = {"amazing", "love", "best", "fantastic", "awesome", "superb", "great"}
    _STRONG_NEGATIVE = {
        "hate", "worst", "bad", "terrible", "awful", "disappointed", "horrible", "cringe",
    }

    def __init__(self):
        """Select the torch device, load all models and build the lookup tables."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._initialize_models()
        self._setup_emotion_mappings()

    def _initialize_models(self):
        """Load the tokenizers and models and move the models to ``self.device``."""
        print("Loading sentiment analysis models...")
        # English models
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(
            "finiteautomata/bertweet-base-emotion-analysis"
        )
        self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
            "finiteautomata/bertweet-base-emotion-analysis"
        ).to(self.device)

        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.sentiment_model = RobertaForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-roberta-base-sentiment-latest",
            ignore_mismatched_sizes=True
        ).to(self.device)

        # Hindi/English model
        # NOTE(review): this loads the base "ai4bharat/indic-bert" checkpoint
        # with a 3-label head; presumably that head is untrained until
        # train_hindi_model() has been run - confirm before trusting its output.
        self.hindi_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
        self.hindi_model = AlbertForSequenceClassification.from_pretrained(
            "ai4bharat/indic-bert",
            num_labels=3,
            id2label={0: "negative", 1: "neutral", 2: "positive"},
            label2id={"negative": 0, "neutral": 1, "positive": 2}
        ).to(self.device)
        self.hindi_label2id = self.hindi_model.config.label2id

    def _setup_emotion_mappings(self):
        """Build the emotion->sentiment map and the promotional keyword set."""
        self.emotion_map = {
            "joy": "positive", "love": "positive", "happy": "positive",
            "anger": "negative", "sadness": "negative", "fear": "negative",
            "surprise": "neutral", "neutral": "neutral", "disgust": "negative", "shame": "negative"
        }
        self.neutral_keywords = {
            "ad", "sponsored", "promo", "sale", "discount", "offer", "giveaway",
            "buy", "shop", "link in bio",
            "विज्ञापन", "प्रचार", "ऑफर", "डिस्काउंट", "बिक्री", "लिंक बायो में"
        }

    @staticmethod
    def _contains_keyword(text, keywords):
        """Return True if any keyword occurs in ``text`` as a whole word/phrase.

        Word edges are checked with lookarounds instead of ``\\b``: ``\\b``
        needs a ``\\w`` character on one side, so a keyword ending in a
        Devanagari vowel sign or anusvara (which are not ``\\w``) could never
        match at the end of a word.
        """
        return any(
            re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text)
            for keyword in keywords
        )

    def train_hindi_model(self, train_data, eval_data=None):
        """Fine-tune the Hindi model and save it to ``./fine_tuned_hindi_sentiment``.

        Args:
            train_data: Data accepted by ``pandas.DataFrame`` that yields a
                ``text`` column and a ``label`` column of label names.
            eval_data: Optional evaluation data in the same form; when given,
                accuracy and weighted F1 are computed every epoch.
        """
        print("Fine-tuning Hindi sentiment model...")

        def map_labels_to_ids(examples):
            # Unknown label names are mapped to neutral rather than dropped.
            labels = []
            for label_str in examples["label"]:
                if label_str in self.hindi_label2id:
                    labels.append(self.hindi_label2id[label_str])
                else:
                    print(f"Warning: Unexpected label '{label_str}'. Mapping to neutral.")
                    labels.append(self.hindi_label2id["neutral"])
            examples["label"] = labels
            return examples

        def tokenize_function(examples):
            return self.hindi_tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=CONFIG["max_length"]
            )

        def prepare(records):
            # records -> Dataset with integer labels and tokenized text
            dataset = Dataset.from_pandas(pd.DataFrame(records))
            dataset = dataset.map(map_labels_to_ids, batched=True)
            dataset = dataset.cast_column("label", Value("int64"))
            return dataset.map(tokenize_function, batched=True)

        def compute_metrics(p):
            predictions, labels = p
            predictions = np.argmax(predictions, axis=1)
            return {
                "accuracy": accuracy_score(labels, predictions),
                "f1": f1_score(labels, predictions, average="weighted")
            }

        tokenized_train = prepare(train_data)
        eval_dataset_processed = prepare(eval_data) if eval_data else None

        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch" if eval_data else "no",
            per_device_train_batch_size=CONFIG["batch_size"],
            per_device_eval_batch_size=CONFIG["batch_size"],
            learning_rate=CONFIG["learning_rate"],
            num_train_epochs=CONFIG["num_train_epochs"],
            weight_decay=0.01,
            save_strategy="no",
            logging_dir='./logs',
            logging_steps=10,
            report_to="none"
        )

        trainer = Trainer(
            model=self.hindi_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=eval_dataset_processed,
            compute_metrics=compute_metrics if eval_data else None,
        )

        trainer.train()
        self.hindi_model.save_pretrained("./fine_tuned_hindi_sentiment")
        self.hindi_tokenizer.save_pretrained("./fine_tuned_hindi_sentiment")

    def preprocess_text(self, text):
        """Normalize a caption: spell out emojis, drop URLs/mentions, expand slang.

        Returns an empty string for empty or ``None`` input.
        """
        if not text:
            return ""

        text = emoji.demojize(text, delimiters=(" ", " "))
        text = re.sub(r"http\S+|@\w+", "", text)

        for pattern, replacement in self._ABBREVIATIONS.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return re.sub(r"\s+", " ", text).strip()

    def detect_language(self, text):
        """Return ``"hi"`` (Devanagari), ``"hi-latin"`` (Hinglish) or ``"en"``."""
        if re.search(r"[\u0900-\u097F]", text):
            return "hi"
        lowered = text.lower()
        if any(re.search(rf"\b{kw}\b", lowered) for kw in self._HINGLISH_KEYWORDS):
            return "hi-latin"
        return "en"

    def analyze_content(self, text):
        """Classify one caption.

        Returns:
            ``(label, confidence, details)`` where ``label`` is ``"positive"``,
            ``"neutral"`` or ``"negative"`` and ``details`` is a dict describing
            how the label was reached.  Model failures are reported as a
            neutral result carrying the error message.
        """
        processed = self.preprocess_text(text)
        if not processed:
            return "neutral", 0.5, {"reason": "empty_text"}

        lang = self.detect_language(processed)

        # Promotional captions are treated as neutral without running a model.
        if self._contains_keyword(processed.lower(), self.neutral_keywords):
            return "neutral", 0.9, {"reason": "neutral_keyword"}

        try:
            if lang in ("hi", "hi-latin"):
                return self._analyze_hindi_content(processed)
            return self._analyze_english_content(processed)
        except Exception as e:
            print(f"Analysis error: {e}")
            return "neutral", 0.5, {"error": str(e), "original_text": text[:50]}

    def _top_prediction(self, tokenizer, model, text):
        """Run ``model`` on one text and return ``(class_index, probability)``."""
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=CONFIG["max_length"]
        ).to(self.device)

        with torch.no_grad():
            logits = model(**inputs).logits

        # A single text is passed in, so row 0 holds its class probabilities.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
        index = int(torch.argmax(probs))
        return index, probs[index].item()

    def _analyze_hindi_content(self, text):
        """Score a Hindi or Hinglish caption with the IndicBERT classifier."""
        pred_idx, confidence = self._top_prediction(
            self.hindi_tokenizer, self.hindi_model, text
        )
        label = self.hindi_model.config.id2label[pred_idx]
        return label, confidence, {"model": "fine-tuned-indic-bert", "lang": "hi"}

    def _analyze_english_content(self, text):
        """Score an English caption by combining emotion and sentiment models.

        A confident sentiment prediction wins; otherwise a confident
        non-neutral emotion wins; otherwise the two are reconciled, falling
        back to neutral when neither is convincing.
        """
        # Emotion analysis
        emotion_pred, emotion_score = self._top_prediction(
            self.emotion_tokenizer, self.emotion_model, text
        )
        emotion_label = self.emotion_model.config.id2label[emotion_pred]

        # Sentiment analysis
        sentiment_pred, sentiment_score = self._top_prediction(
            self.sentiment_tokenizer, self.sentiment_model, text
        )
        sentiment_label_mapping = {0: 'negative', 1: 'neutral', 2: 'positive'}
        sentiment_label = sentiment_label_mapping.get(sentiment_pred, 'neutral')

        # Combine results; emotions missing from the map count as neutral.
        mapped_emotion = self.emotion_map.get(emotion_label, "neutral")
        threshold = CONFIG["confidence_threshold"]

        if sentiment_score > threshold:
            final_label = sentiment_label
            final_confidence = sentiment_score
            reason = "high_sentiment_confidence"
        elif emotion_score > threshold and mapped_emotion != "neutral":
            final_label = mapped_emotion
            final_confidence = emotion_score
            reason = "high_emotion_confidence"
        elif sentiment_label == mapped_emotion and sentiment_label != "neutral":
            final_label = sentiment_label
            final_confidence = (sentiment_score + emotion_score) / 2
            reason = "emotion_sentiment_agreement"
        elif sentiment_label != "neutral" and sentiment_score > emotion_score and sentiment_score > 0.4:
            final_label = sentiment_label
            final_confidence = sentiment_score * 0.9
            reason = "sentiment_slightly_higher"
        elif mapped_emotion != "neutral" and emotion_score > sentiment_score and emotion_score > 0.4:
            final_label = mapped_emotion
            final_confidence = emotion_score * 0.9
            reason = "emotion_slightly_higher"
        else:
            final_label = "neutral"
            final_confidence = 0.6
            reason = "fallback_to_neutral"

        return final_label, final_confidence, {
            "emotion_label": emotion_label,
            "emotion_score": emotion_score,
            "sentiment_label": sentiment_label,
            "sentiment_score": sentiment_score,
            "mapped_emotion": mapped_emotion,
            "model": "ensemble",
            "lang": "en",
            "reason": reason
        }

    def analyze_reels(self, reels, max_to_analyze=100):
        """Classify the captions of up to ``max_to_analyze`` reels.

        Args:
            reels: Sequence of reel objects exposing ``id`` and a
                ``caption_text`` or ``caption`` attribute.
            max_to_analyze: Upper bound on the number of reels processed.

        Returns:
            ``(results, detailed_results)``: a ``Counter`` of labels and a list
            with one dict per analyzed reel.
        """
        selected = reels[:max_to_analyze]
        # Report how many reels are really processed, not the requested cap.
        print(f"Analyzing {len(selected)} reels...")
        results = Counter()
        detailed_results = []

        for reel in selected:
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            label, confidence, details = self.analyze_content(caption)
            results[label] += 1
            detailed_results.append({
                "reel_id": reel.id,
                "text": caption,
                "label": label,
                "confidence": confidence,
                "details": details
            })

        # When too large a share is neutral, give weak neutrals a keyword pass.
        total = sum(results.values())
        if total > 0 and results["neutral"] / total > CONFIG["neutral_reanalysis_threshold"]:
            self._reduce_neutrals(results, detailed_results)

        return results, detailed_results

    def _reduce_neutrals(self, results, detailed_results):
        """Re-label low-confidence neutral items that contain strong keywords.

        Mutates ``results`` and the matching entries of ``detailed_results``
        in place.  Items containing both positive and negative keywords are
        left neutral.
        """
        for item in detailed_results:
            if item["label"] != "neutral" or item["confidence"] >= 0.8:
                continue

            text_lower = self.preprocess_text(item["text"]).lower()
            is_strong_pos = self._contains_keyword(text_lower, self._STRONG_POSITIVE)
            is_strong_neg = self._contains_keyword(text_lower, self._STRONG_NEGATIVE)
            if is_strong_pos == is_strong_neg:
                continue  # no signal, or conflicting signals

            new_label = "positive" if is_strong_pos else "negative"
            results["neutral"] -= 1
            results[new_label] += 1
            item.update({
                "label": new_label,
                "confidence": min(0.95, item["confidence"] + 0.3),
                "reanalyzed": True,
                "reanalysis_reason": (
                    "strong_pos_keywords" if is_strong_pos else "strong_neg_keywords"
                )
            })
362
+
363
def plot_sentiment_pie(results, title="Reels Sentiment Analysis"):
    """Build a pie chart of positive/neutral/negative counts.

    Args:
        results: Mapping (for example a ``Counter``) with optional
            ``'positive'``, ``'neutral'`` and ``'negative'`` counts.
        title: Chart title.

    Returns:
        A matplotlib ``Figure``, or ``None`` when every count is zero.
    """
    slices = [
        # (label, count, colour, explode offset)
        ("Positive", results.get('positive', 0), '#4CAF50', 0.05),
        ("Neutral", results.get('neutral', 0), '#FFC107', 0),
        ("Negative", results.get('negative', 0), '#F44336', 0.05),
    ]
    # Zero-size wedges would still get a label and a "0.0%" caption drawn on
    # top of their neighbours, so only non-empty categories are plotted.
    visible = [entry for entry in slices if entry[1] > 0]
    if not visible:
        return None

    # Build the figure directly rather than through pyplot: pyplot keeps a
    # global reference to every figure it creates (a leak in a long-running
    # server) and plt.title()/plt.tight_layout() act on whichever figure is
    # "current", which is not safe when requests overlap.
    from matplotlib.figure import Figure

    labels, sizes, colors, explode = zip(*visible)
    fig = Figure(figsize=(8, 6))
    ax = fig.subplots()
    ax.pie(sizes, explode=explode, labels=labels, colors=colors,
           autopct='%1.1f%%', shadow=True, startangle=140,
           textprops={'fontsize': 12, 'color': 'black'})
    ax.axis('equal')
    ax.set_title(title, fontsize=16, pad=20)
    fig.tight_layout()
    return fig
380
+
381
def plot_category_distribution(counter, title="Reels Content Distribution"):
    """Build a pie chart of content-category counts.

    Categories holding less than 2% of the total, and the ``"other"``
    category itself, are merged into a single "Other" wedge.

    Args:
        counter: ``Counter`` mapping category name to count.
        title: Chart title.

    Returns:
        A matplotlib ``Figure``, or ``None`` when there is nothing to plot.
    """
    total = sum(counter.values())
    if total == 0:
        return None

    threshold = total * 0.02
    other_count = 0
    labels = []
    sizes = []

    for category, count in counter.most_common():
        if count >= threshold and category != "other":
            labels.append(category.replace('_', ' ').title())
            sizes.append(count)
        else:
            other_count += count

    if other_count > 0:
        labels.append("Other")
        sizes.append(other_count)

    if not sizes:
        return None

    # Build the figure directly rather than through pyplot: pyplot keeps a
    # global reference to every figure it creates (a leak in a long-running
    # server) and plt.title()/plt.tight_layout() act on whichever figure is
    # "current", which is not safe when requests overlap.
    from matplotlib.figure import Figure

    fig = Figure(figsize=(10, 8))
    ax = fig.subplots()
    colors = plt.cm.viridis(np.linspace(0, 1, len(sizes)))
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors,
           wedgeprops={'edgecolor': 'white', 'linewidth': 1}, textprops={'fontsize': 11})
    ax.set_title(title, pad=20, fontsize=15)
    ax.axis('equal')
    fig.tight_layout()
    return fig
413
+
414
def preprocess_text_cat(text):
    """Normalize a caption for category matching.

    URLs, @mentions and #hashtags are removed, the text is lower-cased and
    runs of whitespace are collapsed.  Empty or ``None`` input gives ``""``.
    """
    if text:
        without_noise = re.sub(r"http\S+|@\w+|#\w+", "", text)
        collapsed = re.sub(r"\s+", " ", without_noise.lower())
        return collapsed.strip()
    return ""
419
+
420
def classify_reel_content(text):
    """Assign a content category to a caption.

    Keyword matching is tried first, in ``CATEGORY_KEYWORDS`` order; the
    zero-shot classifier is consulted only when no keyword matches.

    Returns:
        ``(category, details)`` where ``category`` is one of
        ``CONTENT_CATEGORIES`` or ``"other"`` and ``details`` records why.
    """
    cleaned = preprocess_text_cat(text)

    # Captions with fewer than two words carry too little signal.
    if not cleaned or len(cleaned.split()) < 2:
        return "other", {"reason": "short_text"}

    def mentions_any(words):
        return any(re.search(rf"\b{re.escape(keyword)}\b", cleaned) for keyword in words)

    matched = next(
        (name for name, words in CATEGORY_KEYWORDS.items() if mentions_any(words)),
        None,
    )
    if matched is not None:
        return matched, {"reason": "keyword_match"}

    if content_classifier is None:
        return "other", {"reason": "classifier_not_initialized"}

    try:
        prediction = content_classifier(cleaned[:256], CONTENT_CATEGORIES, multi_label=False)
        best_label = prediction['labels'][0]
        best_score = prediction['scores'][0]
        # Low-confidence predictions are reported as "other".
        if best_score > 0.5:
            chosen = best_label
        else:
            chosen = "other"
        return chosen, {"reason": "model_prediction", "score": best_score}
    except Exception as e:
        print(f"Classification error: {e}")
        return "other", {"reason": "classification_error"}
442
+
443
+ # Gradio Interface Functions
444
def login_gradio_auto():
    """Log in to Instagram with credentials taken from the environment.

    The password is read from ``INSTAGRAM_PASSWORD`` and the account name
    from ``INSTAGRAM_USERNAME`` (default ``jattman1993``).

    Returns:
        ``(status_message, otp_row_update)``; the second element makes the
        OTP row visible only when two-factor authentication is required.
    """
    global cl
    import os

    # SECURITY: the password used to be a string literal in this file.  It
    # must never live in source control; the previously committed value
    # should be considered leaked and rotated.
    username = os.environ.get("INSTAGRAM_USERNAME", "jattman1993")
    password = os.environ.get("INSTAGRAM_PASSWORD")

    if not password:
        return (
            "Error: Instagram password not found. "
            "Set the INSTAGRAM_PASSWORD environment variable.",
            gr.update(visible=False),
        )

    client = Client()
    try:
        client.login(username, password)
    except Exception as e:
        error_message = str(e)
        if "Two factor challenged" in error_message or "challenge_required" in error_message:
            # Keep the client: the OTP callback needs it, and discarding it
            # here would make the OTP row a dead end ("Not logged in").
            cl = client
            return "Login failed: Two-factor authentication required.", gr.update(visible=True)
        cl = None
        return f"Error during login: {error_message}", gr.update(visible=False)

    cl = client
    return f"Successfully logged in as {username}", gr.update(visible=False)
464
+
465
def submit_otp_gradio(otp_code):
    """Complete a two-factor login with the code entered by the user.

    Args:
        otp_code: Text from the OTP textbox.

    Returns:
        ``(status_message, otp_textbox_value, otp_row_update)``.  The textbox
        is always cleared; the OTP row stays visible when the user should
        try again.
    """
    if cl is None:
        return "Error: Not logged in.", "", gr.update(visible=False)

    # Reject blank input locally instead of sending it to the API.
    code = (otp_code or "").strip()
    if not code:
        return "OTP failed: no code entered.", "", gr.update(visible=True)

    try:
        # NOTE(review): confirm that the instagrapi Client exposes
        # two_factor_login(); its documented flow passes the code to
        # login(..., verification_code=...) - verify against the installed version.
        cl.two_factor_login(code)
    except Exception as e:
        return f"OTP failed: {e}", "", gr.update(visible=True)
    return "OTP successful. Logged in as jattman1993.", "", gr.update(visible=False)
475
+
476
def fetch_reels_gradio():
    """Fetch explore reels into ``explore_reels_list`` and return a status line.

    At most the first 100 reels returned by the client are kept.  The list is
    emptied when no client is available or the fetch fails.
    """
    global cl, explore_reels_list

    if cl is None:
        explore_reels_list = []
        return "Error: Not logged in."

    try:
        fetched = cl.explore_reels()[:100]
        status = f"Fetched {len(fetched)} reels."
    except Exception as e:
        fetched = []
        status = f"Error fetching reels: {e}"

    explore_reels_list = fetched
    return status
488
+
489
def analyze_reels_gradio(max_to_analyze):
    """Run sentiment and content analysis on the fetched reels.

    Args:
        max_to_analyze: Requested number of reels (the slider value).

    Returns:
        ``(status_text, sentiment_figure, content_figure)``; a figure is
        ``None`` when its analysis failed or produced nothing to plot.
    """
    global explore_reels_list, sentiment_analyzer, content_classifier

    if not explore_reels_list:
        return "Error: No reels fetched.", None, None

    # The value is used as a slice bound, so it must be an integer, and a
    # non-positive bound would silently select the wrong reels.
    try:
        requested = int(max_to_analyze)
    except (TypeError, ValueError):
        return "Error: Invalid number of reels.", None, None
    if requested < 1:
        return "Error: Number of reels must be at least 1.", None, None

    num_reels = min(requested, len(explore_reels_list))
    reels_to_analyze = explore_reels_list[:num_reels]

    status_messages = []
    sentiment_plot = None
    content_plot = None

    # Sentiment Analysis.  Model loading happens inside the try block so a
    # load failure is reported in the status text and does not prevent the
    # content analysis from running.
    try:
        if sentiment_analyzer is None:
            sentiment_analyzer = ReelSentimentAnalyzer()
        sentiment_results, _ = sentiment_analyzer.analyze_reels(
            reels_to_analyze, max_to_analyze=len(reels_to_analyze)
        )
        sentiment_plot = plot_sentiment_pie(sentiment_results)
        status_messages.append("Sentiment analysis complete.")
    except Exception as e:
        status_messages.append(f"Sentiment error: {e}")

    # Content Analysis
    try:
        if content_classifier is None:
            content_classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1
            )
        category_counts = Counter()
        for reel in reels_to_analyze:
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            category, _ = classify_reel_content(caption)
            category_counts[category] += 1
        content_plot = plot_category_distribution(category_counts)
        status_messages.append("Content analysis complete.")
    except Exception as e:
        status_messages.append(f"Content error: {e}")

    return "\n".join(status_messages), sentiment_plot, content_plot
533
+
534
+ # Gradio Interface
535
+ with gr.Blocks() as demo:
536
+ gr.Markdown("# Instagram Reels Analysis")
537
+
538
+ # Login Section
539
+ with gr.Row():
540
+ connect_btn = gr.Button("Connect Instagram")
541
+ login_status = gr.Label(label="Login Status")
542
+
543
+ # OTP Input (hidden initially)
544
+ with gr.Row(visible=False) as otp_row:
545
+ otp_input = gr.Textbox(label="Enter OTP Code")
546
+ otp_submit_btn = gr.Button("Submit OTP")
547
+
548
+ # Fetch Section
549
+ with gr.Row():
550
+ fetch_btn = gr.Button("Fetch Reels")
551
+ fetch_status = gr.Label(label="Fetch Status")
552
+
553
+ # Analysis Section
554
+ with gr.Row():
555
+ max_reels = gr.Slider(1, 100, value=10, step=1, label="Number of Reels to Analyze")
556
+ analyze_btn = gr.Button("Analyze Reels")
557
+ analyze_status = gr.Label(label="Analysis Status")
558
+
559
+ # Results Section
560
+ with gr.Row():
561
+ with gr.Column():
562
+ gr.Markdown("## Sentiment Analysis")
563
+ sentiment_output = gr.Plot(label="Sentiment Distribution")
564
+ with gr.Column():
565
+ gr.Markdown("## Content Analysis")
566
+ content_output = gr.Plot(label="Content Distribution")
567
+
568
+ # Event handlers
569
+ connect_btn.click(
570
+ login_gradio_auto,
571
+ inputs=None,
572
+ outputs=[login_status, otp_row]
573
+ )
574
+ otp_submit_btn.click(
575
+ submit_otp_gradio,
576
+ inputs=otp_input,
577
+ outputs=[login_status, otp_input, otp_row]
578
+ )
579
+ fetch_btn.click(
580
+ fetch_reels_gradio,
581
+ inputs=None,
582
+ outputs=fetch_status
583
+ )
584
+ analyze_btn.click(
585
+ analyze_reels_gradio,
586
+ inputs=max_reels,
587
+ outputs=[analyze_status, sentiment_output, content_output]
588
+ )
589
+
590
+ if __name__ == "__main__":
591
+ demo.launch()
deploy.py CHANGED
@@ -1,356 +1,591 @@
1
-
2
- def plot_sentiment_pie(results, title="Reels Sentiment Analysis"):
3
- """
4
- Creates a pie chart from sentiment analysis results and returns the matplotlib figure.
5
-
6
- Args:
7
- results: Counter object or dict with 'positive', 'neutral', 'negative' keys
8
- title: Chart title
9
-
10
- Returns:
11
- Matplotlib Figure object, or None if no data.
12
- """
13
- labels = ['Positive', 'Neutral', 'Negative']
14
- sizes = [results.get('positive', 0), results.get('neutral', 0), results.get('negative', 0)]
15
-
16
- if sum(sizes) == 0:
17
- return None
18
-
19
- colors = ['#4CAF50', '#FFC107', '#F44336']
20
- explode = (0.05, 0, 0.05)
21
-
22
- fig, ax = plt.subplots(figsize=(8, 6))
23
-
24
- filtered_labels = [label for i, label in enumerate(labels) if sizes[i] > 0]
25
- filtered_sizes = [size for size in sizes if size > 0]
26
- filtered_colors = [colors[i] for i, size in enumerate(sizes) if size > 0]
27
- explode_map = {'Positive': 0.05, 'Neutral': 0, 'Negative': 0.05}
28
- filtered_explode = [explode_map.get(label, 0) for label in filtered_labels]
29
-
30
- ax.pie(filtered_sizes, explode=filtered_explode, labels=filtered_labels, colors=filtered_colors,
31
- autopct='%1.1f%%', shadow=True, startangle=140,
32
- textprops={'fontsize': 12, 'color': 'black'})
33
-
34
- ax.axis('equal')
35
- plt.title(title, fontsize=16, pad=20)
36
- plt.tight_layout()
37
-
38
- # Return the figure object instead of saving to bytes
39
- return fig
40
-
41
- def plot_category_distribution(counter, title="Reels Content Distribution"):
42
- """
43
- Generate pie chart from category counts and returns the matplotlib figure.
44
-
45
- Args:
46
- counter: Counter object with category counts.
47
- title: Chart title.
48
-
49
- Returns:
50
- Matplotlib Figure object, or None if no data.
51
- """
52
- labels = []
53
- sizes = []
54
-
55
- total = sum(counter.values())
56
- if total == 0:
57
- return None
58
-
59
- threshold = total * 0.02
60
- other_count = 0
61
-
62
- sorted_categories = counter.most_common()
63
-
64
- for category, count in sorted_categories:
65
- if count >= threshold and category != "other":
66
- labels.append(category.replace('_', ' ').title())
67
- sizes.append(count)
68
- elif category == "other":
69
- other_count += count
70
- else:
71
- other_count += count
72
-
73
- if other_count > 0:
74
- labels.append("Other")
75
- sizes.append(other_count)
76
-
77
- if not sizes:
78
- return None
79
-
80
- fig, ax = plt.subplots(figsize=(10, 8))
81
- colors = plt.cm.viridis(np.linspace(0, 1, len(sizes)))
82
-
83
- ax.pie(
84
- sizes,
85
- labels=labels,
86
- autopct='%1.1f%%',
87
- startangle=140,
88
- colors=colors,
89
- wedgeprops={'edgecolor': 'white', 'linewidth': 1},
90
- textprops={'fontsize': 11, 'color': 'black'}
91
- )
92
-
93
- plt.title(title, pad=20, fontsize=15)
94
- plt.axis('equal')
95
- plt.tight_layout()
96
-
97
- # Return the figure object instead of saving to bytes
98
- return fig
99
-
100
- # The rest of the Gradio Blocks interface definition and function linking
101
- # should remain the same, as the analyze_reels_gradio function already
102
- # calls these plotting functions and is intended to return the figure objects now.
103
-
104
- # Global variables to maintain state across Gradio calls
105
- global cl
106
- global explore_reels_list
107
- global sentiment_analyzer_instance
108
- global content_classifier_pipeline
109
-
110
- # Initialize sentiment analyzer if not already done (can be done here or lazily in analyze_reels_gradio)
111
- # Doing it here ensures the model is loaded when this cell runs, potentially reducing latency on first analyze click.
112
- try:
113
- sentiment_analyzer_instance = ReelSentimentAnalyzer()
114
- print("Sentiment Analyzer initialized.")
115
- # Optional: Train Hindi model if needed and data is available
116
- # sample_train_data = [...] # Define your training data
117
- # sentiment_analyzer_instance.train_hindi_model(sample_train_data)
118
- except Exception as e:
119
- print(f"Error initializing Sentiment Analyzer globally: {e}")
120
- sentiment_analyzer_instance = None
121
-
122
-
123
- # Initialize content classifier pipeline if not already done (can be done here or lazily)
124
- try:
125
- print("Initializing Content Classifier Pipeline globally...")
126
- content_classifier_pipeline = pipeline(
127
- "zero-shot-classification",
128
- model="facebook/bart-large-mnli",
129
- device=0 if torch.cuda.is_available() else -1 # Use GPU if available
130
- )
131
- print("Content Classifier Pipeline Initialized.")
132
- except Exception as e:
133
- print(f"Error initializing Content Classifier globally: {e}")
134
- content_classifier_pipeline = None
135
-
136
-
137
- def login_gradio(username):
138
- """Gradio-compatible login function."""
139
- global cl
140
- try:
141
- PASSWORD = userdata.get('password')
142
- except Exception as e:
143
- return f"Error accessing password secret: {e}"
144
-
145
-
146
- if not PASSWORD:
147
- return "Error: Instagram password not found in Colab secrets."
148
-
149
- cl = Client()
150
-
151
- try:
152
- cl.login(username, PASSWORD)
153
- return f"Successfully logged in as {username}"
154
- except Exception as e:
155
- cl = None # Ensure cl is None on failure
156
- return f"Error during login: {e}"
157
-
158
- def fetch_reels_gradio():
159
- """Gradio-compatible function to fetch explore reels."""
160
- global cl
161
- global explore_reels_list
162
-
163
- if cl is None:
164
- explore_reels_list = [] # Ensure list is empty on failure
165
- return "Error: Not logged in. Please log in first."
166
-
167
- try:
168
- # Fetch a limited number of reels for demonstration purposes
169
- # You might want to make this number configurable later
170
- fetched_reels = cl.explore_reels()[:100] # Fetch up to 100 for analysis
171
- explore_reels_list = fetched_reels
172
- if explore_reels_list:
173
- return f"Successfully fetched {len(explore_reels_list)} explore reels."
174
- else:
175
- explore_reels_list = [] # Ensure it's an empty list
176
- return "Fetched 0 explore reels."
177
- except Exception as e:
178
- explore_reels_list = [] # Ensure it's an empty list on error
179
- return f"Error fetching explore reels: {e}"
180
-
181
-
182
- def analyze_reels_gradio(max_to_analyze):
183
- """Gradio-compatible function to analyze fetched reels and generate plots."""
184
- global explore_reels_list
185
- global sentiment_analyzer_instance
186
- global content_classifier_pipeline
187
-
188
- if not explore_reels_list:
189
- # Return None for plots if no reels
190
- return "Error: No reels fetched yet. Please fetch reels first.", None, None
191
-
192
- # Ensure max_to_analyze does not exceed the number of fetched reels
193
- num_reels_to_process = min(max_to_analyze, len(explore_reels_list))
194
- reels_to_analyze = explore_reels_list[:num_reels_to_process]
195
-
196
- if not reels_to_analyze:
197
- return "Error: No reels available to analyze.", None, None
198
-
199
-
200
- # Check if analyzers are initialized
201
- if sentiment_analyzer_instance is None:
202
- return "Error: Sentiment Analyzer not initialized.", None, None
203
- if content_classifier_pipeline is None:
204
- return "Error: Content Classifier not initialized.", None, None
205
-
206
-
207
- analysis_status_messages = []
208
- sentiment_plot_figure = None # Changed to figure
209
- content_plot_figure = None # Changed to figure
210
-
211
- # Perform Sentiment Analysis
212
- try:
213
- analysis_status_messages.append(f"Starting Sentiment Analysis for {len(reels_to_analyze)} reels...")
214
- sentiment_results, detailed_sentiment_results = sentiment_analyzer_instance.analyze_reels(
215
- reels_to_analyze,
216
- max_to_analyze=len(reels_to_analyze) # Pass the actual number being processed
217
- )
218
- # Call the updated plotting function that returns a figure
219
- sentiment_plot_figure = plot_sentiment_pie(sentiment_results, title=f"Sentiment of {len(reels_to_analyze)} Instagram Reels")
220
- analysis_status_messages.append("Sentiment Analysis Complete.")
221
- except Exception as e:
222
- analysis_status_messages.append(f"Error during Sentiment Analysis: {e}")
223
- sentiment_plot_figure = None # Ensure plot is None on error
224
-
225
-
226
- # Perform Content Categorization
227
- try:
228
- analysis_status_messages.append(f"Starting Content Categorization for {len(reels_to_analyze)} reels...")
229
- category_counts = Counter()
230
- # Re-implement content analysis slightly to fit this flow using the global pipeline
231
- print(f"\n⏳ Analyzing content for {len(reels_to_analyze)} reels...")
232
- for i, reel in enumerate(reels_to_analyze, 1):
233
- caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
234
- # Use the global classifier pipeline
235
- category, details = classify_reel_content(caption)
236
- category_counts[category] += 1
237
-
238
- print("\n✅ Content Analysis complete!")
239
- print("\n📊 Category Counts:")
240
- for category, count in category_counts.most_common():
241
- print(f"- {category.replace('_', ' ').title()}: {count}")
242
-
243
- # Call the updated plotting function that returns a figure
244
- content_plot_figure = plot_category_distribution(category_counts)
245
- analysis_status_messages.append("Content Categorization Complete.")
246
-
247
- except Exception as e:
248
- analysis_status_messages.append(f"Error during Content Analysis: {e}")
249
- content_plot_figure = None # Ensure plot is None on error
250
-
251
-
252
- final_status_message = "\n".join(analysis_status_messages)
253
- # Return the figure objects
254
- return final_status_message, sentiment_plot_figure, content_plot_figure
255
-
256
- # --- Gradio Blocks Interface ---
257
- with gr.Blocks() as demo:
258
- gr.Markdown("# Instagram Reels Analysis")
259
- with gr.Row():
260
- username_input = gr.Textbox(label="Instagram Username")
261
- login_button = gr.Button("Login")
262
- login_status_output = gr.Label(label="Login Status")
263
-
264
- with gr.Row():
265
- fetch_button = gr.Button("Fetch Reels")
266
- fetch_status_output = gr.Label(label="Fetch Status")
267
-
268
- with gr.Row():
269
- max_reels_input = gr.Slider(minimum=1, maximum=100, value=10, step=1, label="Number of Reels to Analyze")
270
- analyze_button = gr.Button("Analyze Reels")
271
-
272
- analyze_status_output = gr.Label(label="Analysis Status")
273
-
274
- with gr.Row():
275
- # Sentiment Analysis Outputs
276
- with gr.Column():
277
- gr.Markdown("## Sentiment Analysis")
278
- sentiment_plot_output = gr.Plot(label="Sentiment Distribution")
279
-
280
- # Content Analysis Outputs
281
- with gr.Column():
282
- gr.Markdown("## Content Analysis")
283
- content_plot_output = gr.Plot(label="Content Distribution")
284
-
285
-
286
- # Link login and fetch buttons (assuming login_gradio and fetch_reels_gradio are defined)
287
- # Redefine login_gradio and fetch_reels_gradio here within the Blocks context
288
- # to ensure they are linked correctly, even though they were defined above.
289
- # This is a common pattern in Gradio Blocks.
290
-
291
- def login_gradio_blocks(username):
292
- """Gradio-compatible login function for Blocks."""
293
- global cl
294
- try:
295
- PASSWORD = userdata.get('password')
296
- except Exception as e:
297
- return f"Error accessing password secret: {e}"
298
-
299
-
300
- if not PASSWORD:
301
- return "Error: Instagram password not found in Colab secrets."
302
-
303
- cl = Client()
304
-
305
- try:
306
- cl.login(username, PASSWORD)
307
- return f"Successfully logged in as {username}"
308
- except Exception as e:
309
- cl = None # Ensure cl is None on failure
310
- return f"Error during login: {e}"
311
-
312
- def fetch_reels_gradio_blocks():
313
- """Gradio-compatible function to fetch explore reels for Blocks."""
314
- global cl
315
- global explore_reels_list
316
-
317
- if cl is None:
318
- explore_reels_list = [] # Ensure list is empty on failure
319
- return "Error: Not logged in. Please log in first."
320
-
321
- try:
322
- # Fetch a limited number of reels for demonstration purposes
323
- # You might want to make this number configurable later
324
- fetched_reels = cl.explore_reels()[:100] # Fetch up to 100 for analysis
325
- explore_reels_list = fetched_reels
326
- if explore_reels_list:
327
- return f"Successfully fetched {len(explore_reels_list)} explore reels."
328
- else:
329
- explore_reels_list = [] # Ensure it's an empty list
330
- return "Fetched 0 explore reels."
331
- except Exception as e:
332
- explore_reels_list = [] # Ensure it's an empty list on error
333
- return f"Error fetching explore reels: {e}"
334
-
335
-
336
- login_button.click(
337
- fn=login_gradio_blocks,
338
- inputs=username_input,
339
- outputs=login_status_output
340
- )
341
-
342
- fetch_button.click(
343
- fn=fetch_reels_gradio_blocks,
344
- inputs=None, # No direct inputs needed for fetching
345
- outputs=fetch_status_output
346
- )
347
-
348
- # Link the Analyze button to the analysis function
349
- analyze_button.click(
350
- fn=analyze_reels_gradio,
351
- inputs=max_reels_input, # Input is the slider value
352
- outputs=[analyze_status_output, sentiment_plot_output, content_plot_output] # Outputs are status and the two plots
353
- )
354
-
355
- # The demo is now fully defined. It can be launched in the next step.
356
- # demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+ import torch
5
+ import emoji
6
+ import re
7
+ import numpy as np
8
+ from collections import Counter
9
+ from instagrapi import Client
10
+ from transformers import (
11
+ pipeline,
12
+ AutoTokenizer,
13
+ AutoModelForSequenceClassification,
14
+ Trainer,
15
+ TrainingArguments,
16
+ RobertaForSequenceClassification,
17
+ AlbertForSequenceClassification
18
+ )
19
+ from datasets import Dataset, Features, Value
20
+ from sklearn.metrics import accuracy_score, f1_score
21
+
22
+ # Configuration
23
# Tunable hyper-parameters and thresholds shared by the analysis code.
CONFIG = dict(
    max_length=128,                     # tokenizer truncation length
    batch_size=16,                      # per-device train/eval batch size
    learning_rate=2e-5,
    num_train_epochs=3,
    few_shot_examples=5,
    confidence_threshold=0.7,           # a single model above this wins outright
    neutral_reanalysis_threshold=0.33,  # neutral share that triggers a re-check
)

# Global state shared between the Gradio callbacks.
cl = None                  # logged-in instagrapi client, or None
explore_reels_list = []    # reels returned by the last fetch
sentiment_analyzer = None  # lazily created ReelSentimentAnalyzer
content_classifier = None  # lazily created zero-shot pipeline

# Keyword shortcuts per content category; insertion order defines the
# priority used when a caption matches several categories.
CATEGORY_KEYWORDS = {
    "news": {"news", "update", "breaking", "reported", "headlines"},
    "meme": {"meme", "funny", "lol", "haha", "relatable"},
    "sports": {
        "sports", "cricket", "football", "match", "game", "team", "score",
    },
    "science": {
        "science", "research", "discovery", "experiment", "facts", "theory",
    },
    "music": {"music", "song", "album", "release", "artist", "beats"},
    "movie": {"movie", "film", "bollywood", "trailer", "series", "actor"},
    "gym": {
        "gym", "workout", "fitness", "exercise", "training", "bodybuilding",
    },
    "comedy": {"comedy", "joke", "humor", "standup", "skit", "laugh"},
    "food": {
        "food", "recipe", "cooking", "eat", "delicious", "restaurant",
        "kitchen",
    },
    "technology": {
        "tech", "phone", "computer", "ai", "gadget", "software", "innovation",
    },
    "travel": {
        "travel", "trip", "vacation", "explore", "destination", "adventure",
    },
    "fashion": {"fashion", "style", "ootd", "outfit", "trends", "clothing"},
    "art": {"art", "artist", "painting", "drawing", "creative", "design"},
    "business": {
        "business", "startup", "marketing", "money", "finance", "entrepreneur",
    },
}

# Candidate labels for the zero-shot classifier: one per keyword table entry,
# in the same order.
CONTENT_CATEGORIES = list(CATEGORY_KEYWORDS)
61
+
62
class ReelSentimentAnalyzer:
    """Classify reel captions as positive, neutral or negative.

    Captions containing Devanagari characters or common romanized-Hindi
    words are scored by an IndicBERT sequence classifier; everything else is
    scored by an emotion model and a sentiment model whose predictions are
    combined in ``_analyze_english_content``.
    """

    # Index -> label mapping applied to the English sentiment model output.
    _SENTIMENT_LABELS = {0: "negative", 1: "neutral", 2: "positive"}

    # Language-detection patterns, compiled once instead of per caption.
    _DEVANAGARI_PATTERN = re.compile(r"[\u0900-\u097F]")
    _HINGLISH_PATTERN = re.compile(
        r"\b(?:hai|kyun|nahi|kya|acha|bas|yaar|main)\b"
    )

    # Chat abbreviations expanded (case-insensitively) during preprocessing.
    _ABBREVIATIONS = tuple(
        (re.compile(pattern, flags=re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r"\bomg\b", "oh my god"),
            (r"\btbh\b", "to be honest"),
            (r"\bky\b", "kyun"),
            (r"\bkb\b", "kab"),
            (r"\bkya\b", "kya"),
            (r"\bkahan\b", "kahan"),
            (r"\bkaisa\b", "kaisa"),
        )
    )

    # Strong polarity words used when re-checking low-confidence neutrals.
    _POSITIVE_KEYWORDS = frozenset(
        {"amazing", "love", "best", "fantastic", "awesome", "superb", "great"}
    )
    _NEGATIVE_KEYWORDS = frozenset(
        {"hate", "worst", "bad", "terrible", "awful", "disappointed",
         "horrible", "cringe"}
    )

    def __init__(self):
        """Select the torch device, load all models and build lookup tables."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self._initialize_models()
        self._setup_emotion_mappings()

    def _initialize_models(self):
        """Load the tokenizers and models and move the models to ``self.device``."""
        print("Loading sentiment analysis models...")
        # English models
        emotion_name = "finiteautomata/bertweet-base-emotion-analysis"
        self.emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_name)
        self.emotion_model = AutoModelForSequenceClassification.from_pretrained(
            emotion_name
        ).to(self.device)

        sentiment_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
        self.sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_name)
        self.sentiment_model = RobertaForSequenceClassification.from_pretrained(
            sentiment_name,
            ignore_mismatched_sizes=True
        ).to(self.device)

        # Hindi/English model
        # NOTE(review): this loads the base "ai4bharat/indic-bert" checkpoint
        # with a 3-label head; the weights saved by train_hindi_model() are
        # not reloaded here -- confirm that is intended.
        hindi_name = "ai4bharat/indic-bert"
        self.hindi_tokenizer = AutoTokenizer.from_pretrained(hindi_name)
        self.hindi_model = AlbertForSequenceClassification.from_pretrained(
            hindi_name,
            num_labels=3,
            id2label={0: "negative", 1: "neutral", 2: "positive"},
            label2id={"negative": 0, "neutral": 1, "positive": 2}
        ).to(self.device)
        self.hindi_label2id = self.hindi_model.config.label2id

    def _setup_emotion_mappings(self):
        """Build the emotion -> sentiment table and the neutral keyword set."""
        self.emotion_map = {
            "joy": "positive", "love": "positive", "happy": "positive",
            "anger": "negative", "sadness": "negative", "fear": "negative",
            "surprise": "neutral", "neutral": "neutral",
            "disgust": "negative", "shame": "negative"
        }
        # Promotional vocabulary: captions containing any of these are
        # reported as neutral without running a model.
        self.neutral_keywords = {
            "ad", "sponsored", "promo", "sale", "discount", "offer", "giveaway",
            "buy", "shop", "link in bio",
            "विज्ञापन", "प्रचार", "ऑफर", "डिस्काउंट", "बिक्री", "लिंक बायो में"
        }

    @staticmethod
    def _contains_keyword(text, keywords):
        """Return True when *text* contains any of *keywords* as a whole word.

        Look-arounds are used instead of ``\\b``: Python's ``\\w`` does not
        include combining marks, so ``\\b`` never matches after a Devanagari
        keyword that ends in a vowel sign or anusvara. For keywords that
        start and end with a word character the two forms are equivalent.
        """
        return any(
            re.search(rf"(?<!\w){re.escape(keyword)}(?!\w)", text)
            for keyword in keywords
        )

    def train_hindi_model(self, train_data, eval_data=None):
        """Fine-tune the Hindi model and save it to ``./fine_tuned_hindi_sentiment``.

        Args:
            train_data: Data accepted by ``pandas.DataFrame`` providing a
                ``text`` column and a string ``label`` column.
            eval_data: Optional evaluation data in the same format; when
                given, accuracy and weighted F1 are computed every epoch.
        """
        print("Fine-tuning Hindi sentiment model...")
        # ``len`` is used so a DataFrame argument does not raise on truth-testing.
        has_eval = eval_data is not None and len(eval_data) > 0

        def map_labels_to_ids(examples):
            labels = []
            for label_str in examples["label"]:
                if label_str in self.hindi_label2id:
                    labels.append(self.hindi_label2id[label_str])
                else:
                    print(f"Warning: Unexpected label '{label_str}'. Mapping to neutral.")
                    labels.append(self.hindi_label2id["neutral"])
            examples["label"] = labels
            return examples

        def tokenize_function(examples):
            return self.hindi_tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=CONFIG["max_length"]
            )

        def prepare(data):
            # String labels -> ids -> int64 column -> tokenized features.
            dataset = Dataset.from_pandas(pd.DataFrame(data))
            dataset = dataset.map(map_labels_to_ids, batched=True)
            dataset = dataset.cast_column("label", Value("int64"))
            return dataset.map(tokenize_function, batched=True)

        def compute_metrics(p):
            predictions, labels = p
            predictions = np.argmax(predictions, axis=1)
            return {
                "accuracy": accuracy_score(labels, predictions),
                "f1": f1_score(labels, predictions, average="weighted")
            }

        tokenized_train = prepare(train_data)
        eval_dataset_processed = prepare(eval_data) if has_eval else None

        training_args = TrainingArguments(
            output_dir="./results",
            eval_strategy="epoch" if has_eval else "no",
            per_device_train_batch_size=CONFIG["batch_size"],
            per_device_eval_batch_size=CONFIG["batch_size"],
            learning_rate=CONFIG["learning_rate"],
            num_train_epochs=CONFIG["num_train_epochs"],
            weight_decay=0.01,
            save_strategy="no",
            logging_dir='./logs',
            logging_steps=10,
            report_to="none"
        )

        trainer = Trainer(
            model=self.hindi_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=eval_dataset_processed,
            compute_metrics=compute_metrics if has_eval else None,
        )

        trainer.train()
        # Training leaves the module in train mode; switch back so dropout is
        # disabled for the inference calls that follow.
        self.hindi_model.eval()
        self.hindi_model.save_pretrained("./fine_tuned_hindi_sentiment")
        self.hindi_tokenizer.save_pretrained("./fine_tuned_hindi_sentiment")

    def preprocess_text(self, text):
        """Normalize a caption before language detection and scoring.

        Emojis are replaced by their space-delimited names, URLs and
        @mentions are removed, known abbreviations are expanded and runs of
        whitespace are collapsed. Falsy input yields an empty string.
        """
        if not text:
            return ""

        text = emoji.demojize(text, delimiters=(" ", " "))
        text = re.sub(r"http\S+|@\w+", "", text)
        for pattern, replacement in self._ABBREVIATIONS:
            text = pattern.sub(replacement, text)

        return re.sub(r"\s+", " ", text).strip()

    def detect_language(self, text):
        """Return ``"hi"`` (Devanagari), ``"hi-latin"`` (Hinglish) or ``"en"``."""
        if self._DEVANAGARI_PATTERN.search(text):
            return "hi"
        if self._HINGLISH_PATTERN.search(text.lower()):
            return "hi-latin"
        return "en"

    def analyze_content(self, text):
        """Score one caption.

        Returns:
            A ``(label, confidence, details)`` tuple. Empty captions and
            captions containing a promotional keyword are neutral; a model
            failure is reported as neutral with the error in ``details``.
        """
        processed = self.preprocess_text(text)
        if not processed:
            return "neutral", 0.5, {"reason": "empty_text"}

        lang = self.detect_language(processed)

        if self._contains_keyword(processed.lower(), self.neutral_keywords):
            return "neutral", 0.9, {"reason": "neutral_keyword"}

        try:
            if lang in ("hi", "hi-latin"):
                return self._analyze_hindi_content(processed)
            return self._analyze_english_content(processed)
        except Exception as e:
            # Best effort: one bad caption must not abort a whole batch.
            print(f"Analysis error: {e}")
            return "neutral", 0.5, {"error": str(e), "original_text": text[:50]}

    def _predict(self, tokenizer, model, text, **tokenizer_kwargs):
        """Run one classifier on *text*; return ``(class_index, probability)``."""
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=CONFIG["max_length"],
            **tokenizer_kwargs
        ).to(self.device)

        with torch.no_grad():
            outputs = model(**inputs)

        # A single text is scored, so row 0 holds the whole distribution.
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
        pred_idx = torch.argmax(probs).item()
        return pred_idx, probs[pred_idx].item()

    def _analyze_hindi_content(self, text):
        """Score Hindi or Hinglish text with the IndicBERT classifier."""
        pred_idx, confidence = self._predict(
            self.hindi_tokenizer, self.hindi_model, text, padding=True
        )
        label = self.hindi_model.config.id2label[pred_idx]
        return label, confidence, {"model": "fine-tuned-indic-bert", "lang": "hi"}

    @staticmethod
    def _combine_predictions(sentiment_label, sentiment_score,
                             mapped_emotion, emotion_score):
        """Merge both English predictions into ``(label, confidence, reason)``.

        A confident sentiment prediction wins, then a confident non-neutral
        emotion; otherwise the two are reconciled, falling back to neutral.
        """
        threshold = CONFIG["confidence_threshold"]
        if sentiment_score > threshold:
            return sentiment_label, sentiment_score, "high_sentiment_confidence"
        if emotion_score > threshold and mapped_emotion != "neutral":
            return mapped_emotion, emotion_score, "high_emotion_confidence"
        if sentiment_label == mapped_emotion and sentiment_label != "neutral":
            return (sentiment_label, (sentiment_score + emotion_score) / 2,
                    "emotion_sentiment_agreement")
        if (sentiment_label != "neutral" and sentiment_score > emotion_score
                and sentiment_score > 0.4):
            return sentiment_label, sentiment_score * 0.9, "sentiment_slightly_higher"
        if (mapped_emotion != "neutral" and emotion_score > sentiment_score
                and emotion_score > 0.4):
            return mapped_emotion, emotion_score * 0.9, "emotion_slightly_higher"
        return "neutral", 0.6, "fallback_to_neutral"

    def _analyze_english_content(self, text):
        """Score English text with the emotion + sentiment ensemble."""
        # Emotion analysis
        emotion_pred, emotion_score = self._predict(
            self.emotion_tokenizer, self.emotion_model, text
        )
        emotion_label = self.emotion_model.config.id2label[emotion_pred]

        # Sentiment analysis
        sentiment_pred, sentiment_score = self._predict(
            self.sentiment_tokenizer, self.sentiment_model, text
        )
        sentiment_label = self._SENTIMENT_LABELS.get(sentiment_pred, 'neutral')

        # Combine results; emotions missing from the table count as neutral.
        mapped_emotion = self.emotion_map.get(emotion_label, "neutral")
        final_label, final_confidence, reason = self._combine_predictions(
            sentiment_label, sentiment_score, mapped_emotion, emotion_score
        )

        return final_label, final_confidence, {
            "emotion_label": emotion_label,
            "emotion_score": emotion_score,
            "sentiment_label": sentiment_label,
            "sentiment_score": sentiment_score,
            "mapped_emotion": mapped_emotion,
            "model": "ensemble",
            "lang": "en",
            "reason": reason
        }

    def analyze_reels(self, reels, max_to_analyze=100):
        """Score up to *max_to_analyze* reels.

        Args:
            reels: Sequence of reel objects; the caption is read from
                ``caption_text`` or ``caption`` and the id from ``id``.
            max_to_analyze: Upper bound on the number of reels processed.

        Returns:
            ``(results, detailed_results)``: a ``Counter`` of labels and one
            detail dict per analysed reel.
        """
        batch = reels[:max_to_analyze]
        # Report what is actually processed, not the requested cap.
        print(f"Analyzing {len(batch)} reels...")
        results = Counter()
        detailed_results = []

        for reel in batch:
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            label, confidence, details = self.analyze_content(caption)
            results[label] += 1
            detailed_results.append({
                # Read defensively, like the caption, so an object without
                # ``id`` does not abort the batch.
                "reel_id": getattr(reel, "id", None),
                "text": caption,
                "label": label,
                "confidence": confidence,
                "details": details
            })

        total = sum(results.values())
        if total > 0 and results["neutral"] / total > CONFIG["neutral_reanalysis_threshold"]:
            self._reduce_neutrals(results, detailed_results)

        return results, detailed_results

    def _reduce_neutrals(self, results, detailed_results):
        """Relabel low-confidence neutrals that contain strong polarity words.

        Mutates *results* and the matching entries of *detailed_results* in
        place. Items matching both or neither polarity are left untouched.
        """
        for item in detailed_results:
            if item["label"] != "neutral" or item["confidence"] >= 0.8:
                continue

            text_lower = self.preprocess_text(item["text"]).lower()
            is_strong_pos = self._contains_keyword(text_lower, self._POSITIVE_KEYWORDS)
            is_strong_neg = self._contains_keyword(text_lower, self._NEGATIVE_KEYWORDS)
            if is_strong_pos == is_strong_neg:
                continue

            if is_strong_pos:
                new_label, reason = "positive", "strong_pos_keywords"
            else:
                new_label, reason = "negative", "strong_neg_keywords"

            results["neutral"] -= 1
            results[new_label] += 1
            item.update({
                "label": new_label,
                "confidence": min(0.95, item["confidence"] + 0.3),
                "reanalyzed": True,
                "reanalysis_reason": reason
            })
362
+
363
def plot_sentiment_pie(results, title="Reels Sentiment Analysis"):
    """Draw a pie chart of positive / neutral / negative counts.

    Args:
        results: Mapping with optional ``positive``, ``neutral`` and
            ``negative`` counts (missing keys count as 0).
        title: Chart title.

    Returns:
        The matplotlib figure, or ``None`` when all three counts are zero.
    """
    sizes = [results.get(key, 0) for key in ("positive", "neutral", "negative")]
    if sum(sizes) == 0:
        return None

    labels = ['Positive', 'Neutral', 'Negative']
    colors = ['#4CAF50', '#FFC107', '#F44336']
    explode = (0.05, 0, 0.05)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(sizes, explode=explode, labels=labels, colors=colors,
           autopct='%1.1f%%', shadow=True, startangle=140,
           textprops={'fontsize': 12, 'color': 'black'})
    ax.axis('equal')
    # Address this figure explicitly: pyplot's "current figure" is global
    # state and may belong to another request when callbacks overlap.
    ax.set_title(title, fontsize=16, pad=20)
    fig.tight_layout()
    # Drop the figure from pyplot's registry so repeated analyses do not
    # accumulate open figures; the returned object can still be rendered.
    plt.close(fig)
    return fig
380
+
381
def plot_category_distribution(counter, title="Reels Content Distribution"):
    """Draw a pie chart of content-category counts.

    Categories holding less than 2% of the total, and the ``"other"``
    category itself, are merged into a single "Other" slice.

    Args:
        counter: ``collections.Counter`` mapping category name to count.
        title: Chart title.

    Returns:
        The matplotlib figure, or ``None`` when the counter is empty.
    """
    total = sum(counter.values())
    if total == 0:
        return None

    threshold = total * 0.02
    other_count = 0
    labels = []
    sizes = []

    for category, count in counter.most_common():
        if count >= threshold and category != "other":
            labels.append(category.replace('_', ' ').title())
            sizes.append(count)
        else:
            other_count += count

    if other_count > 0:
        labels.append("Other")
        sizes.append(other_count)

    fig, ax = plt.subplots(figsize=(10, 8))
    colors = plt.cm.viridis(np.linspace(0, 1, len(sizes)))
    ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors,
           wedgeprops={'edgecolor': 'white', 'linewidth': 1}, textprops={'fontsize': 11})
    # Address this figure explicitly: pyplot's "current figure" is global
    # state and may belong to another request when callbacks overlap.
    ax.set_title(title, pad=20, fontsize=15)
    ax.axis('equal')
    fig.tight_layout()
    # Drop the figure from pyplot's registry so repeated analyses do not
    # accumulate open figures; the returned object can still be rendered.
    plt.close(fig)
    return fig
413
+
414
def preprocess_text_cat(text):
    """Normalize a caption for content categorization.

    Strips URLs, @mentions and #hashtags, lower-cases the remainder and
    collapses whitespace. Falsy input yields an empty string.
    """
    if text:
        without_noise = re.sub(r"http\S+|@\w+|#\w+", "", text)
        return re.sub(r"\s+", " ", without_noise.lower()).strip()
    return ""
419
+
420
def classify_reel_content(text):
    """Assign a content category to a caption.

    Resolution order: captions with fewer than two words are ``"other"``;
    the first category in ``CATEGORY_KEYWORDS`` with a whole-word keyword hit
    wins; otherwise the zero-shot ``content_classifier`` decides, and its top
    label is accepted only when its score exceeds 0.5.

    Returns:
        ``(category, details)`` where ``details`` records the reason.
    """
    cleaned = preprocess_text_cat(text)
    # An empty string splits into zero words, so this also covers "no text".
    if len(cleaned.split()) < 2:
        return "other", {"reason": "short_text"}

    keyword_hit = next(
        (
            name
            for name, words in CATEGORY_KEYWORDS.items()
            if any(re.search(rf"\b{re.escape(word)}\b", cleaned) for word in words)
        ),
        None,
    )
    if keyword_hit is not None:
        return keyword_hit, {"reason": "keyword_match"}

    if content_classifier is None:
        return "other", {"reason": "classifier_not_initialized"}

    try:
        prediction = content_classifier(cleaned[:256], CONTENT_CATEGORIES, multi_label=False)
        best_label = prediction['labels'][0]
        best_score = prediction['scores'][0]
        chosen = best_label if best_score > 0.5 else "other"
        return chosen, {"reason": "model_prediction", "score": best_score}
    except Exception as e:
        print(f"Classification error: {e}")
        return "other", {"reason": "classification_error"}
442
+
443
+ # Gradio Interface Functions
444
def _instagram_credentials():
    """Return ``(username, password)`` read from the process environment.

    Credentials must not live in source control: the password comes from
    ``INSTAGRAM_PASSWORD`` (empty when unset) and the username from
    ``INSTAGRAM_USERNAME``.
    """
    import os

    username = os.environ.get("INSTAGRAM_USERNAME", "jattman1993")
    password = os.environ.get("INSTAGRAM_PASSWORD", "")
    return username, password


def login_gradio_auto():
    """Log in to Instagram with the configured account.

    Returns:
        ``(status_message, otp_row_update)``. The OTP row is made visible
        only when the login failed because a two-factor code is required.
    """
    global cl
    username, password = _instagram_credentials()
    if not password:
        return "Error: Instagram password not found.", gr.update(visible=False)

    client = Client()
    try:
        client.login(username, password)
    except Exception as e:
        cl = None
        error_message = str(e)
        needs_otp = (
            type(e).__name__ == "TwoFactorRequired"
            or "Two factor challenged" in error_message
            or "challenge_required" in error_message
        )
        if needs_otp:
            return "Login failed: Two-factor authentication required.", gr.update(visible=True)
        return f"Error during login: {error_message}", gr.update(visible=False)

    cl = client
    return f"Successfully logged in as {username}", gr.update(visible=False)


def submit_otp_gradio(otp_code):
    """Complete a two-factor login with the code entered by the user.

    A failed first login leaves no usable client behind, so the login is
    repeated on a fresh client with the verification code attached.

    Returns:
        ``(status_message, cleared_otp_text, otp_row_update)``. The OTP row
        stays visible when the code was missing or rejected.
    """
    global cl
    otp_code = (otp_code or "").strip()
    if not otp_code:
        return "Error: Please enter the OTP code.", "", gr.update(visible=True)

    username, password = _instagram_credentials()
    if not password:
        return "Error: Instagram password not found.", "", gr.update(visible=False)

    client = Client()
    try:
        client.login(username, password, verification_code=otp_code)
    except Exception as e:
        return f"OTP failed: {e}", "", gr.update(visible=True)

    cl = client
    return f"OTP successful. Logged in as {username}.", "", gr.update(visible=False)
475
+
476
def fetch_reels_gradio():
    """Fetch up to 100 explore reels into ``explore_reels_list``.

    Returns a status string for the UI. The shared list is left empty
    whenever there is no logged-in client or the fetch raises.
    """
    global cl, explore_reels_list

    fetched = []
    status = "Error: Not logged in."
    if cl is not None:
        try:
            fetched = cl.explore_reels()[:100]
            status = f"Fetched {len(fetched)} reels."
        except Exception as e:
            fetched = []
            status = f"Error fetching reels: {e}"

    explore_reels_list = fetched
    return status
488
+
489
def analyze_reels_gradio(max_to_analyze):
    """Run sentiment and content analysis on the fetched reels.

    Args:
        max_to_analyze: Requested number of reels (the slider value); it is
            coerced to an integer and clamped to ``1..len(explore_reels_list)``.

    Returns:
        ``(status_text, sentiment_figure, content_figure)``. A figure is
        ``None`` when its analysis failed or produced nothing to plot.
    """
    global sentiment_analyzer, content_classifier

    if not explore_reels_list:
        return "Error: No reels fetched.", None, None

    # The UI may deliver the slider value as a float; slicing needs an int.
    try:
        requested = int(max_to_analyze)
    except (TypeError, ValueError):
        return "Error: Invalid number of reels.", None, None
    num_reels = min(max(requested, 1), len(explore_reels_list))
    reels_to_analyze = explore_reels_list[:num_reels]

    status_messages = []
    sentiment_plot = None
    content_plot = None

    # Sentiment Analysis (models are loaded lazily, inside the try so a
    # loading failure is reported as a status message like any other error)
    try:
        if sentiment_analyzer is None:
            sentiment_analyzer = ReelSentimentAnalyzer()
        sentiment_results, _ = sentiment_analyzer.analyze_reels(reels_to_analyze)
        sentiment_plot = plot_sentiment_pie(sentiment_results)
        status_messages.append("Sentiment analysis complete.")
    except Exception as e:
        status_messages.append(f"Sentiment error: {e}")

    # Content Analysis
    try:
        if content_classifier is None:
            content_classifier = pipeline(
                "zero-shot-classification",
                model="facebook/bart-large-mnli",
                device=0 if torch.cuda.is_available() else -1
            )
        category_counts = Counter()
        for reel in reels_to_analyze:
            caption = getattr(reel, 'caption_text', '') or getattr(reel, 'caption', '') or ''
            category, _ = classify_reel_content(caption)
            category_counts[category] += 1
        content_plot = plot_category_distribution(category_counts)
        status_messages.append("Content analysis complete.")
    except Exception as e:
        status_messages.append(f"Content error: {e}")

    return "\n".join(status_messages), sentiment_plot, content_plot
533
+
534
+ # Gradio Interface
535
# Build the single-page UI. Components are laid out in the order they are
# created, and event handlers are registered inside the same Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("# Instagram Reels Analysis")

    # Login Section
    with gr.Row():
        connect_btn = gr.Button("Connect Instagram")
        login_status = gr.Label(label="Login Status")

    # OTP Input (hidden initially); the login handlers toggle this row's
    # visibility through their returned gr.update(...) value.
    with gr.Row(visible=False) as otp_row:
        otp_input = gr.Textbox(label="Enter OTP Code")
        otp_submit_btn = gr.Button("Submit OTP")

    # Fetch Section
    with gr.Row():
        fetch_btn = gr.Button("Fetch Reels")
        fetch_status = gr.Label(label="Fetch Status")

    # Analysis Section
    with gr.Row():
        max_reels = gr.Slider(1, 100, value=10, step=1, label="Number of Reels to Analyze")
        analyze_btn = gr.Button("Analyze Reels")
    analyze_status = gr.Label(label="Analysis Status")

    # Results Section: one column per chart
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Sentiment Analysis")
            sentiment_output = gr.Plot(label="Sentiment Distribution")
        with gr.Column():
            gr.Markdown("## Content Analysis")
            content_output = gr.Plot(label="Content Distribution")

    # Event handlers: each handler's return values are mapped positionally
    # onto the components listed in ``outputs``.
    connect_btn.click(
        login_gradio_auto,
        inputs=None,
        outputs=[login_status, otp_row]
    )
    otp_submit_btn.click(
        submit_otp_gradio,
        inputs=otp_input,
        outputs=[login_status, otp_input, otp_row]
    )
    fetch_btn.click(
        fetch_reels_gradio,
        inputs=None,
        outputs=fetch_status
    )
    analyze_btn.click(
        analyze_reels_gradio,
        inputs=max_reels,
        outputs=[analyze_status, sentiment_output, content_output]
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()