Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ from langdetect import detect
|
|
4 |
from transformers import pipeline
|
5 |
from keybert import KeyBERT
|
6 |
import os
|
|
|
7 |
import re
|
8 |
|
9 |
# --- SETUP ---
|
@@ -12,58 +13,32 @@ openai.api_key = os.getenv("OPENAI_API_KEY") # Set in HF Space Secrets
|
|
12 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
13 |
kw_model = KeyBERT()
|
14 |
|
15 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
BRANDS = [
|
17 |
-
"Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal",
|
18 |
"HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
|
19 |
-
"Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
|
20 |
"PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
|
21 |
-
"Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential", "
|
22 |
]
|
23 |
|
|
|
24 |
NEGATIVE_KEYWORDS = [
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
"Impersonation angel broking", "Fraud cheat", "Portfolio Management Service",
|
30 |
-
"Guarantee return", "Guaranteed return", "Tampered document", "Fake document",
|
31 |
-
"Forged document", "Promising huge return", "Ponzi Dabba", "Synchronised trade",
|
32 |
-
"Made huge profit", "Siphon amount", "Strategy During Market at Angel One",
|
33 |
-
"Account Handling", "Fixed Income from Market", "Weekly Expiry Make Money",
|
34 |
-
"Imposing Angel One", "Profit Share", "Profit Sharing", "Password Sharing",
|
35 |
-
"Password Share", "Unauthorized Trade", "Advisory Services", "Fake Avdisory",
|
36 |
-
"Arrest", "Took Money", "Fraud", "Cheat", "Portfolio Management Services", "PMS",
|
37 |
-
"Gurantee Return", "Guranteed Return", "Huge Return", "Ponzi", "Dabba",
|
38 |
-
"Make Huge Profit", "Siphon Amount", "Accout Handling", "Account Handling Services",
|
39 |
-
"Weekly Expire Make Money", "Account Handling聽", "Account Handle", "huge profit",
|
40 |
-
"advisor", "advisory", "assured return", "Premium Advice", "Free Advice",
|
41 |
-
"Free Advisory", "Life time free paid calls", "free paid calls", "paid calls",
|
42 |
-
"premium advisory", "Get Free Advice", "free calls with accuracy", "Free calls",
|
43 |
-
"Options Intraday Tips", "Equity call Intraday", "Equity call Intraday & Delivery",
|
44 |
-
"Equity call Delivery", "Premium advisor", "Gurantee Return Services",
|
45 |
-
"Guranteed Return Services", "advisor Services", "assured return Services",
|
46 |
-
"Premium Advice Services", "Free Advice Services", "Free Advisory Services",
|
47 |
-
"Life time free paid calls Services", "free paid calls Services", "paid calls Services",
|
48 |
-
"premium advisory Services", "Stock Recommendation", "Amount Doubling",
|
49 |
-
"Best Trade Level In Nifty, Bank Nifty With Accuracy", "Daily Accurate Calls",
|
50 |
-
"Earn Profit", "Expert Calls", "Fixe Profit Commitment", "Fixed Return", "For Jackpot Trade",
|
51 |
-
"Good Profits Daily", "Guaranteed Profit", "Paid Investment Plans", "Jackpot Call",
|
52 |
-
"Loss & Profit Sharing", "Nifty Bank-Nifty And Stock Option Calls .", "Pay & Get (Amount)",
|
53 |
-
"Sure Shot Calls", "Tips Provide", "Stock tips", "losses", "stock picks", "Multibagger picks",
|
54 |
-
"High return on investmentInsider Trading Offer/Scheme", "Advance Fee Fraud", "Pyramid Scheme",
|
55 |
-
"Boiler Room Scam", "Municipal Securities updates", "Churning offers", "Front Running Amount",
|
56 |
-
"Wash Trading Amount", "Bear Raiding", "Account Takeover", "Binary Options",
|
57 |
-
"Unregistered Securities", "High-Yield Investment Program", "Forex Amount", "Smurfing offers",
|
58 |
-
"Invest Quickly", "Trading account opening offer", "Discount on trading account",
|
59 |
-
"Bonus on Opening account", "Bull Capturing", "Confirmed Swing Options",
|
60 |
-
"Get Dividend every month", "Penny Stock recommendation", "Bawaal Stock Dhamaal return",
|
61 |
-
"From thousand to Crores portfolio", "Multibagger stock tips", "Best Over sold stocks",
|
62 |
-
"Best Over bought stocks", "High dividend yield stocks", "Future stock recommendation",
|
63 |
-
"Growth scanners", "Growth Screeners", "Bullish stock recommendation", "Bull stocks recommendation",
|
64 |
-
"Bearish stock recommendation", "Bear stocks recommendation"
|
65 |
-
]
|
66 |
]
|
|
|
67 |
|
68 |
def extract_brands(text):
|
69 |
found = [brand for brand in BRANDS if brand.lower() in text.lower()]
|
@@ -103,9 +78,28 @@ def make_str(val):
|
|
103 |
except Exception:
|
104 |
return ""
|
105 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
def process_audio(audio_path):
|
107 |
if not audio_path or not isinstance(audio_path, str):
|
108 |
-
return ("No audio file provided.", "", "", "", "", "", "")
|
109 |
try:
|
110 |
with open(audio_path, "rb") as audio_file:
|
111 |
transcript = openai.audio.transcriptions.create(
|
@@ -115,7 +109,7 @@ def process_audio(audio_path):
|
|
115 |
)
|
116 |
transcript = make_str(transcript).strip()
|
117 |
except Exception as e:
|
118 |
-
return (f"Error in transcription: {e}", "", "", "", "", "", "")
|
119 |
try:
|
120 |
detected_lang = detect(transcript)
|
121 |
lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
|
@@ -133,15 +127,30 @@ def process_audio(audio_path):
|
|
133 |
transcript_en = make_str(transcript_en).strip()
|
134 |
except Exception as e:
|
135 |
transcript_en = f"Error translating: {e}"
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
summary = f"Error summarizing: {e}"
|
141 |
brands = extract_brands(transcript_en)
|
142 |
topics = extract_topics(transcript_en)
|
143 |
key_takeaways = make_bullets(summary)
|
144 |
negatives = extract_negative_keywords(transcript_en)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
145 |
return (
|
146 |
lang_text,
|
147 |
transcript,
|
@@ -149,7 +158,10 @@ def process_audio(audio_path):
|
|
149 |
", ".join(brands),
|
150 |
", ".join(topics),
|
151 |
key_takeaways,
|
152 |
-
", ".join(negatives)
|
|
|
|
|
|
|
153 |
)
|
154 |
|
155 |
iface = gr.Interface(
|
@@ -162,10 +174,16 @@ iface = gr.Interface(
|
|
162 |
gr.Textbox(label="Indian Brokerages & Fintech Brands Detected"),
|
163 |
gr.Textbox(label="Key Topics"),
|
164 |
gr.Textbox(label="Bulleted Key Takeaways"),
|
165 |
-
gr.Textbox(label="Negative Keywords Detected")
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
],
|
167 |
-
title="Audio Brand,
|
168 |
-
description="Upload
|
169 |
)
|
170 |
|
171 |
iface.launch()
|
|
|
4 |
from transformers import pipeline
|
5 |
from keybert import KeyBERT
|
6 |
import os
|
7 |
+
import pandas as pd
|
8 |
import re
|
9 |
|
10 |
# --- SETUP ---
|
|
|
13 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
14 |
kw_model = KeyBERT()
|
15 |
|
16 |
+
# 1. LOAD EXCEL DATA
# Spreadsheet whose "SB TAG", "MOBILE NO." and "E Mail id" columns are turned
# into lookup sets for cross-matching values extracted from a transcript.
# The location can be overridden with the AP_EXCEL_PATH environment variable;
# the default is the original hard-coded path.
EXCEL_PATH = os.getenv("AP_EXCEL_PATH", "/mnt/data/Active AP list - 05.12.2024 4.xlsx")
ap_df = pd.read_excel(EXCEL_PATH)
ap_df.columns = [str(c).strip() for c in ap_df.columns]  # strip spaces


def _normalize_mobile(value):
    """Return the last 10 digits of a spreadsheet mobile value as a string.

    A numeric column that contains empty cells is loaded as float, so a
    plain ``str()`` would yield ``"9876543210.0"``; integral floats are
    converted to ``int`` first. Any non-digit characters (spaces, ``+``,
    dashes) are dropped so the result is comparable with the 10-digit
    strings produced by the phone-number extractor.
    """
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    digits = re.sub(r"\D", "", str(value))
    return digits[-10:] if len(digits) >= 10 else digits


SB_TAG_SET = {str(x).strip().upper() for x in ap_df["SB TAG"].dropna()}
# Skip cells that contain no digits at all so "" never lands in the set.
MOBILE_SET = {
    mobile
    for mobile in (_normalize_mobile(x) for x in ap_df["MOBILE NO."].dropna())
    if mobile
}
EMAIL_SET = {str(x).strip().lower() for x in ap_df["E Mail id"].dropna()}
|
24 |
+
|
25 |
+
# Brands
# Brokerage / fintech names searched for in the transcript. Each name is
# listed once: a repeated entry would be reported twice by a plain
# list-comprehension lookup. Alternate spellings such as "Motilal",
# "Ind Money" and "Sherkhan" are kept as-is (presumably to catch
# speech-to-text variants -- confirm before pruning).
BRANDS = [
    "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
    "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
    "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
    "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC",
    "ICICI Prudential", "Motilal", "Ind Money", "Anand Rathi", "Sherkhan",
]
|
33 |
|
34 |
+
# Negative keywords (for brevity, only a few -- add your full list as needed).
# Every entry is lower-cased once here so the stored list is uniformly
# lower-case regardless of how entries are typed.
NEGATIVE_KEYWORDS = list(map(str.lower, [
    "assuring return",
    "profit share",
    "fake advisor",
    "unauthorized trade",
    "guarantee return",
    "guaranteed return",
    "tampered document",
    "fake document",
    "fraud",
    "cheat",
    "ponzi",
    "dabba",
    "advisor",
    "advisory",
    "assured return",
    "premium advice",
    "free advice",
    "paid calls",
    "stock tips",
    "multibagger",
    # ... add the rest from your list ...
]))
|
42 |
|
43 |
def extract_brands(text):
|
44 |
found = [brand for brand in BRANDS if brand.lower() in text.lower()]
|
|
|
78 |
except Exception:
|
79 |
return ""
|
80 |
|
81 |
+
def extract_phone_numbers(text):
    """Return the unique 10-digit Indian mobile numbers found in *text*.

    A number is ten digits starting with 6-9, optionally preceded by a
    ``+91`` / ``91`` country code or a trunk ``0`` (with an optional dash or
    space after the prefix). Only the ten subscriber digits are returned, so
    the values compare directly against a set of bare 10-digit strings.

    Args:
        text: Text to scan; an empty or ``None`` value yields ``[]``.

    Returns:
        A list of distinct 10-digit strings in order of first appearance.
    """
    if not text:
        return []
    # The number itself is the only capturing group: re.findall returns the
    # group contents, so capturing the optional prefix instead (as the
    # previous pattern did) yields '' / '+91' and never the number.
    # The digit look-arounds stop a match from starting or ending in the
    # middle of a longer digit run (e.g. an account or order number).
    pattern = r'(?<!\d)(?:(?:\+?91|0)[\-\s]?)?([6-9]\d{9})(?!\d)'
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(re.findall(pattern, text)))
|
91 |
+
|
92 |
+
def extract_emails(text):
    """Collect the distinct e-mail-like substrings of *text*.

    The pattern is: word characters, dots or dashes, an ``@``, more of the
    same, and a final ``.suffix`` of word characters. Case is preserved and
    the order of the returned list is unspecified.
    """
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'
    unique_addresses = {hit.group(0) for hit in re.finditer(email_pattern, text)}
    return list(unique_addresses)
|
94 |
+
|
95 |
+
def extract_broker_codes(text):
    """Collect the distinct code-like tokens of *text*, upper-cased.

    A token qualifies when, after upper-casing the whole text, it is a
    stand-alone word made of 2-4 letters A-Z followed by 4-6 digits
    (the shape the original comment describes as a typical AP code).
    The order of the returned list is unspecified.
    """
    code_pattern = re.compile(r'\b([A-Z]{2,4}\d{4,6})\b')
    upper_text = text.upper()
    distinct_codes = {hit.group(1) for hit in code_pattern.finditer(upper_text)}
    return list(distinct_codes)
|
99 |
+
|
100 |
def process_audio(audio_path):
|
101 |
if not audio_path or not isinstance(audio_path, str):
|
102 |
+
return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "")
|
103 |
try:
|
104 |
with open(audio_path, "rb") as audio_file:
|
105 |
transcript = openai.audio.transcriptions.create(
|
|
|
109 |
)
|
110 |
transcript = make_str(transcript).strip()
|
111 |
except Exception as e:
|
112 |
+
return (f"Error in transcription: {e}", "", "", "", "", "", "", "", "", "", "")
|
113 |
try:
|
114 |
detected_lang = detect(transcript)
|
115 |
lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
|
|
|
127 |
transcript_en = make_str(transcript_en).strip()
|
128 |
except Exception as e:
|
129 |
transcript_en = f"Error translating: {e}"
|
130 |
+
|
131 |
+
# Analysis
|
132 |
+
summary_obj = summarizer(transcript_en, max_length=100, min_length=30, do_sample=False)
|
133 |
+
summary = summary_obj[0]["summary_text"] if isinstance(summary_obj, list) and "summary_text" in summary_obj[0] else make_str(summary_obj)
|
|
|
134 |
brands = extract_brands(transcript_en)
|
135 |
topics = extract_topics(transcript_en)
|
136 |
key_takeaways = make_bullets(summary)
|
137 |
negatives = extract_negative_keywords(transcript_en)
|
138 |
+
|
139 |
+
# Extraction
|
140 |
+
phones = extract_phone_numbers(transcript_en)
|
141 |
+
emails = extract_emails(transcript_en)
|
142 |
+
codes = extract_broker_codes(transcript_en)
|
143 |
+
|
144 |
+
# MATCHING
|
145 |
+
matched_codes = sorted(set([c for c in codes if c in SB_TAG_SET]))
|
146 |
+
unmatched_codes = sorted(set([c for c in codes if c not in SB_TAG_SET]))
|
147 |
+
|
148 |
+
matched_phones = sorted(set([p for p in phones if p in MOBILE_SET]))
|
149 |
+
unmatched_phones = sorted(set([p for p in phones if p not in MOBILE_SET]))
|
150 |
+
|
151 |
+
matched_emails = sorted(set([e for e in emails if e.lower() in EMAIL_SET]))
|
152 |
+
unmatched_emails = sorted(set([e for e in emails if e.lower() not in EMAIL_SET]))
|
153 |
+
|
154 |
return (
|
155 |
lang_text,
|
156 |
transcript,
|
|
|
158 |
", ".join(brands),
|
159 |
", ".join(topics),
|
160 |
key_takeaways,
|
161 |
+
", ".join(negatives),
|
162 |
+
", ".join(matched_codes), ", ".join(unmatched_codes),
|
163 |
+
", ".join(matched_phones), ", ".join(unmatched_phones),
|
164 |
+
", ".join(matched_emails), ", ".join(unmatched_emails)
|
165 |
)
|
166 |
|
167 |
iface = gr.Interface(
|
|
|
174 |
gr.Textbox(label="Indian Brokerages & Fintech Brands Detected"),
|
175 |
gr.Textbox(label="Key Topics"),
|
176 |
gr.Textbox(label="Bulleted Key Takeaways"),
|
177 |
+
gr.Textbox(label="Negative Keywords Detected"),
|
178 |
+
gr.Textbox(label="Matched AP Codes"),
|
179 |
+
gr.Textbox(label="Unmatched AP Codes"),
|
180 |
+
gr.Textbox(label="Matched Phone Numbers"),
|
181 |
+
gr.Textbox(label="Unmatched Phone Numbers"),
|
182 |
+
gr.Textbox(label="Matched Email IDs"),
|
183 |
+
gr.Textbox(label="Unmatched Email IDs"),
|
184 |
],
|
185 |
+
title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
|
186 |
+
description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list."
|
187 |
)
|
188 |
|
189 |
iface.launch()
|