jaisun2004 committed on
Commit
9b33471
·
verified ·
1 Parent(s): 3d1b3f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -56
app.py CHANGED
@@ -4,6 +4,7 @@ from langdetect import detect
4
  from transformers import pipeline
5
  from keybert import KeyBERT
6
  import os
 
7
  import re
8
 
9
  # --- SETUP ---
@@ -12,58 +13,32 @@ openai.api_key = os.getenv("OPENAI_API_KEY") # Set in HF Space Secrets
12
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
13
  kw_model = KeyBERT()
14
 
15
- # Key Indian brokerages, investment apps, and fintech brands
 
 
 
 
 
 
 
 
 
16
  BRANDS = [
17
- "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal","Motilal","Sherkhan" "Sharekhan", "5paisa", "ICICI Direct",
18
  "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
19
- "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney", "IND money", "Small Case"
20
  "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
21
- "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential", "Wazir", "India Infoline", "Anand Rathi"
22
  ]
23
 
 
24
  NEGATIVE_KEYWORDS = [
25
- kw.lower() for kw in [
26
- "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
27
- "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
28
- "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
29
- "Impersonation angel broking", "Fraud cheat", "Portfolio Management Service",
30
- "Guarantee return", "Guaranteed return", "Tampered document", "Fake document",
31
- "Forged document", "Promising huge return", "Ponzi Dabba", "Synchronised trade",
32
- "Made huge profit", "Siphon amount", "Strategy During Market at Angel One",
33
- "Account Handling", "Fixed Income from Market", "Weekly Expiry Make Money",
34
- "Imposing Angel One", "Profit Share", "Profit Sharing", "Password Sharing",
35
- "Password Share", "Unauthorized Trade", "Advisory Services", "Fake Avdisory",
36
- "Arrest", "Took Money", "Fraud", "Cheat", "Portfolio Management Services", "PMS",
37
- "Gurantee Return", "Guranteed Return", "Huge Return", "Ponzi", "Dabba",
38
- "Make Huge Profit", "Siphon Amount", "Accout Handling", "Account Handling Services",
39
- "Weekly Expire Make Money", "Account Handling聽", "Account Handle", "huge profit",
40
- "advisor", "advisory", "assured return", "Premium Advice", "Free Advice",
41
- "Free Advisory", "Life time free paid calls", "free paid calls", "paid calls",
42
- "premium advisory", "Get Free Advice", "free calls with accuracy", "Free calls",
43
- "Options Intraday Tips", "Equity call Intraday", "Equity call Intraday & Delivery",
44
- "Equity call Delivery", "Premium advisor", "Gurantee Return Services",
45
- "Guranteed Return Services", "advisor Services", "assured return Services",
46
- "Premium Advice Services", "Free Advice Services", "Free Advisory Services",
47
- "Life time free paid calls Services", "free paid calls Services", "paid calls Services",
48
- "premium advisory Services", "Stock Recommendation", "Amount Doubling",
49
- "Best Trade Level In Nifty, Bank Nifty With Accuracy", "Daily Accurate Calls",
50
- "Earn Profit", "Expert Calls", "Fixe Profit Commitment", "Fixed Return", "For Jackpot Trade",
51
- "Good Profits Daily", "Guaranteed Profit", "Paid Investment Plans", "Jackpot Call",
52
- "Loss & Profit Sharing", "Nifty Bank-Nifty And Stock Option Calls .", "Pay & Get (Amount)",
53
- "Sure Shot Calls", "Tips Provide", "Stock tips", "losses", "stock picks", "Multibagger picks",
54
- "High return on investmentInsider Trading Offer/Scheme", "Advance Fee Fraud", "Pyramid Scheme",
55
- "Boiler Room Scam", "Municipal Securities updates", "Churning offers", "Front Running Amount",
56
- "Wash Trading Amount", "Bear Raiding", "Account Takeover", "Binary Options",
57
- "Unregistered Securities", "High-Yield Investment Program", "Forex Amount", "Smurfing offers",
58
- "Invest Quickly", "Trading account opening offer", "Discount on trading account",
59
- "Bonus on Opening account", "Bull Capturing", "Confirmed Swing Options",
60
- "Get Dividend every month", "Penny Stock recommendation", "Bawaal Stock Dhamaal return",
61
- "From thousand to Crores portfolio", "Multibagger stock tips", "Best Over sold stocks",
62
- "Best Over bought stocks", "High dividend yield stocks", "Future stock recommendation",
63
- "Growth scanners", "Growth Screeners", "Bullish stock recommendation", "Bull stocks recommendation",
64
- "Bearish stock recommendation", "Bear stocks recommendation"
65
- ]
66
  ]
 
67
 
68
  def extract_brands(text):
69
  found = [brand for brand in BRANDS if brand.lower() in text.lower()]
@@ -103,9 +78,28 @@ def make_str(val):
103
  except Exception:
104
  return ""
105
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  def process_audio(audio_path):
107
  if not audio_path or not isinstance(audio_path, str):
108
- return ("No audio file provided.", "", "", "", "", "", "")
109
  try:
110
  with open(audio_path, "rb") as audio_file:
111
  transcript = openai.audio.transcriptions.create(
@@ -115,7 +109,7 @@ def process_audio(audio_path):
115
  )
116
  transcript = make_str(transcript).strip()
117
  except Exception as e:
118
- return (f"Error in transcription: {e}", "", "", "", "", "", "")
119
  try:
120
  detected_lang = detect(transcript)
121
  lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
@@ -133,15 +127,30 @@ def process_audio(audio_path):
133
  transcript_en = make_str(transcript_en).strip()
134
  except Exception as e:
135
  transcript_en = f"Error translating: {e}"
136
- try:
137
- summary_obj = summarizer(transcript_en, max_length=100, min_length=30, do_sample=False)
138
- summary = summary_obj[0]["summary_text"] if isinstance(summary_obj, list) and "summary_text" in summary_obj[0] else make_str(summary_obj)
139
- except Exception as e:
140
- summary = f"Error summarizing: {e}"
141
  brands = extract_brands(transcript_en)
142
  topics = extract_topics(transcript_en)
143
  key_takeaways = make_bullets(summary)
144
  negatives = extract_negative_keywords(transcript_en)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  return (
146
  lang_text,
147
  transcript,
@@ -149,7 +158,10 @@ def process_audio(audio_path):
149
  ", ".join(brands),
150
  ", ".join(topics),
151
  key_takeaways,
152
- ", ".join(negatives)
 
 
 
153
  )
154
 
155
  iface = gr.Interface(
@@ -162,10 +174,16 @@ iface = gr.Interface(
162
  gr.Textbox(label="Indian Brokerages & Fintech Brands Detected"),
163
  gr.Textbox(label="Key Topics"),
164
  gr.Textbox(label="Bulleted Key Takeaways"),
165
- gr.Textbox(label="Negative Keywords Detected")
 
 
 
 
 
 
166
  ],
167
- title="Audio Brand, Topic, and Scam Keyword Analysis for Indian Finance Apps",
168
- description="Upload your audio file (MP3/WAV). Get transcript, summary, Indian brokerage/fintech brand & scam keyword detection, key topics, and a bulleted summary. Powered by OpenAI Whisper and BART."
169
  )
170
 
171
  iface.launch()
 
4
  from transformers import pipeline
5
  from keybert import KeyBERT
6
  import os
7
+ import pandas as pd
8
  import re
9
 
10
  # --- SETUP ---
 
13
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
14
  kw_model = KeyBERT()
15
 
16
# 1. LOAD EXCEL DATA
# Spreadsheet holding the reference AP list that transcripts are cross-matched
# against. The location can be overridden with the AP_LIST_PATH environment
# variable; the default is the path this app was originally written with.
EXCEL_PATH = os.getenv("AP_LIST_PATH", "/mnt/data/Active AP list - 05.12.2024 4.xlsx")


def _normalize_mobile(value):
    """Return a spreadsheet mobile-number cell as its last 10 digits.

    Phone numbers extracted from transcripts are bare 10-digit strings, so the
    reference values have to be reduced to the same shape before comparison.
    Returns whatever digits are present (possibly "") when there are fewer
    than 10.
    """
    text = str(value).strip()
    # A cell loaded as a float stringifies as "9876543210.0"; drop the
    # fractional artefact so it does not contribute a spurious trailing 0.
    if re.fullmatch(r"\d+\.0+", text):
        text = text.split(".", 1)[0]
    digits = re.sub(r"\D", "", text)
    return digits[-10:]


ap_df = pd.read_excel(EXCEL_PATH)
# Headers may carry stray spaces (and need not be strings at all).
ap_df.columns = [str(c).strip() for c in ap_df.columns]

# Lookup sets used for O(1) membership tests; each is normalised the same way
# the values extracted from a transcript are normalised before comparison.
SB_TAG_SET = {str(x).strip().upper() for x in ap_df["SB TAG"].dropna()}
MOBILE_SET = {
    mobile
    for mobile in (_normalize_mobile(x) for x in ap_df["MOBILE NO."].dropna())
    if mobile
}
EMAIL_SET = {str(x).strip().lower() for x in ap_df["E Mail id"].dropna()}
24
+
25
# Brands
# Brokerage / investment-app / fintech names to look for in a transcript.
# Each name appears exactly once so a single mention is never reported twice.
# "Motilal", "Ind Money" and "Sherkhan" are presumably short forms / alternate
# spellings of entries above, kept so transcription variants still match.
# NOTE(review): extract_brands does a case-insensitive substring test, so very
# short entries such as "Fi", "LIC" or "Cred" also match inside ordinary words
# ("profit", "public", "credit") - confirm whether that is intended.
BRANDS = [
    "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
    "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
    "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
    "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC",
    "ICICI Prudential", "Motilal", "Ind Money", "Anand Rathi", "Sherkhan",
]
33
 
34
# Negative keywords (for brevity, only a few - add your full list as needed).
# Phrases are written in any case here and lowercased once below, so the
# resulting NEGATIVE_KEYWORDS list is always all-lowercase.
_NEGATIVE_KEYWORD_PHRASES = (
    "assuring return",
    "profit share",
    "fake advisor",
    "unauthorized trade",
    "guarantee return",
    "guaranteed return",
    "tampered document",
    "fake document",
    "fraud",
    "cheat",
    "ponzi",
    "dabba",
    "advisor",
    "advisory",
    "assured return",
    "premium advice",
    "free advice",
    "paid calls",
    "stock tips",
    "multibagger",
    # ... add the rest from your list ...
)
NEGATIVE_KEYWORDS = [phrase.lower() for phrase in _NEGATIVE_KEYWORD_PHRASES]
42
 
43
  def extract_brands(text):
44
  found = [brand for brand in BRANDS if brand.lower() in text.lower()]
 
78
  except Exception:
79
  return ""
80
 
81
def extract_phone_numbers(text):
    """Return the unique Indian mobile numbers found in *text*.

    A number is 10 digits starting with 6-9, optionally preceded by a
    ``+91``/``91`` country code (with an optional space or hyphen) or a
    leading ``0``. Each result is the bare 10-digit number, in order of first
    appearance. Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    # The prefix group is non-capturing on purpose: with a capturing group,
    # re.findall returns only the group's text (the "+91" prefix or ""), never
    # the number itself. The digit look-arounds stop a 10-digit window from
    # being carved out of a longer digit run (order ids, account numbers...).
    pattern = r'(?<!\d)(?:\+?91[\-\s]?|0)?[6-9]\d{9}(?!\d)'
    numbers = {}
    for match in re.finditer(pattern, text):
        digits = re.sub(r'\D', '', match.group(0))
        # Keep only the subscriber number; a dict preserves first-seen order
        # while removing repeats.
        numbers.setdefault(digits[-10:], None)
    return list(numbers)
91
+
92
def extract_emails(text):
    """Return the unique e-mail addresses found in *text*.

    Addresses that differ only by letter case are treated as the same address
    (callers compare them case-insensitively); the first spelling seen is the
    one returned. Results are in order of first appearance. Empty or ``None``
    input yields an empty list.
    """
    if not text:
        return []
    unique = {}
    for address in re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text):
        # Key on the lowercased form so "A@x.com" and "a@x.com" collapse.
        unique.setdefault(address.lower(), address)
    return list(unique.values())
94
+
95
def extract_broker_codes(text):
    """Return the unique AP-style codes found in *text*, uppercased.

    A code is a standalone token of 2-4 letters followed by 4-6 digits. The
    text is uppercased before matching, so codes are found regardless of the
    case they were written in. Results are in order of first appearance.
    Empty or ``None`` input yields an empty list.
    """
    if not text:
        return []
    # Typical AP codes are 2-4 uppercase letters + 4-6 digits
    codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
    # dict.fromkeys removes repeats while keeping a deterministic order.
    return list(dict.fromkeys(codes))
99
+
100
  def process_audio(audio_path):
101
  if not audio_path or not isinstance(audio_path, str):
102
+ return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "")
103
  try:
104
  with open(audio_path, "rb") as audio_file:
105
  transcript = openai.audio.transcriptions.create(
 
109
  )
110
  transcript = make_str(transcript).strip()
111
  except Exception as e:
112
+ return (f"Error in transcription: {e}", "", "", "", "", "", "", "", "", "", "")
113
  try:
114
  detected_lang = detect(transcript)
115
  lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
 
127
  transcript_en = make_str(transcript_en).strip()
128
  except Exception as e:
129
  transcript_en = f"Error translating: {e}"
130
+
131
+ # Analysis
132
+ summary_obj = summarizer(transcript_en, max_length=100, min_length=30, do_sample=False)
133
+ summary = summary_obj[0]["summary_text"] if isinstance(summary_obj, list) and "summary_text" in summary_obj[0] else make_str(summary_obj)
 
134
  brands = extract_brands(transcript_en)
135
  topics = extract_topics(transcript_en)
136
  key_takeaways = make_bullets(summary)
137
  negatives = extract_negative_keywords(transcript_en)
138
+
139
+ # Extraction
140
+ phones = extract_phone_numbers(transcript_en)
141
+ emails = extract_emails(transcript_en)
142
+ codes = extract_broker_codes(transcript_en)
143
+
144
+ # MATCHING
145
+ matched_codes = sorted(set([c for c in codes if c in SB_TAG_SET]))
146
+ unmatched_codes = sorted(set([c for c in codes if c not in SB_TAG_SET]))
147
+
148
+ matched_phones = sorted(set([p for p in phones if p in MOBILE_SET]))
149
+ unmatched_phones = sorted(set([p for p in phones if p not in MOBILE_SET]))
150
+
151
+ matched_emails = sorted(set([e for e in emails if e.lower() in EMAIL_SET]))
152
+ unmatched_emails = sorted(set([e for e in emails if e.lower() not in EMAIL_SET]))
153
+
154
  return (
155
  lang_text,
156
  transcript,
 
158
  ", ".join(brands),
159
  ", ".join(topics),
160
  key_takeaways,
161
+ ", ".join(negatives),
162
+ ", ".join(matched_codes), ", ".join(unmatched_codes),
163
+ ", ".join(matched_phones), ", ".join(unmatched_phones),
164
+ ", ".join(matched_emails), ", ".join(unmatched_emails)
165
  )
166
 
167
  iface = gr.Interface(
 
174
  gr.Textbox(label="Indian Brokerages & Fintech Brands Detected"),
175
  gr.Textbox(label="Key Topics"),
176
  gr.Textbox(label="Bulleted Key Takeaways"),
177
+ gr.Textbox(label="Negative Keywords Detected"),
178
+ gr.Textbox(label="Matched AP Codes"),
179
+ gr.Textbox(label="Unmatched AP Codes"),
180
+ gr.Textbox(label="Matched Phone Numbers"),
181
+ gr.Textbox(label="Unmatched Phone Numbers"),
182
+ gr.Textbox(label="Matched Email IDs"),
183
+ gr.Textbox(label="Unmatched Email IDs"),
184
  ],
185
+ title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
186
+ description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list."
187
  )
188
 
189
  iface.launch()