Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -9,35 +9,44 @@ import re
|
|
9 |
|
10 |
# ---- CONSTANTS ----
|
11 |
EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
|
12 |
-
CHUNK_SIZE = 600
|
13 |
-
MAX_TRANSCRIPT_WORDS = 3000
|
14 |
|
15 |
-
# --- LOAD ONCE AT STARTUP ---
|
16 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
17 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
18 |
kw_model = KeyBERT()
|
19 |
|
20 |
-
# Load Excel
|
21 |
try:
|
22 |
ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
|
23 |
ap_df.columns = [c.strip() for c in ap_df.columns]
|
24 |
-
SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna())
|
25 |
-
MOBILE_SET = set(re.sub(r'\D', '', str(x).strip()) for x in ap_df["MOBILE NO."].dropna())
|
26 |
-
EMAIL_SET = set(str(x).strip().lower() for x in ap_df["E Mail id"].dropna())
|
27 |
except Exception as e:
|
28 |
-
|
29 |
print(f"Error loading Excel: {e}")
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
BRANDS = [
|
32 |
"Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
|
33 |
"HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
|
34 |
"Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
|
35 |
"PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
|
36 |
-
"Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential",
|
37 |
]
|
38 |
|
39 |
NEGATIVE_KEYWORDS = [
|
40 |
-
|
41 |
"Assuring return", "Invest with us and earn", "Profit Share", "Password share",
|
42 |
"Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
|
43 |
"False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
|
@@ -117,17 +126,14 @@ def make_str(val):
|
|
117 |
return ""
|
118 |
|
119 |
def extract_phone_numbers(text):
|
120 |
-
pattern = r'(\+91[\-\s]?)?[6-9]\d{
|
121 |
matches = re.findall(pattern, text)
|
122 |
-
cleaned = []
|
123 |
-
for m in matches:
|
124 |
-
possible = ''.join(re.findall(r'\d', m))
|
125 |
-
if len(possible) >= 10:
|
126 |
-
cleaned.append(possible[-10:])
|
127 |
return list(set(cleaned))
|
128 |
|
129 |
def extract_emails(text):
|
130 |
-
|
|
|
131 |
|
132 |
def extract_broker_codes(text):
|
133 |
codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
|
@@ -170,7 +176,6 @@ def process_audio(audio_path):
|
|
170 |
try:
|
171 |
if not audio_path or not isinstance(audio_path, str):
|
172 |
return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
|
173 |
-
# Transcription
|
174 |
with open(audio_path, "rb") as audio_file:
|
175 |
transcript = openai.audio.transcriptions.create(
|
176 |
model="whisper-1",
|
@@ -203,25 +208,34 @@ def process_audio(audio_path):
|
|
203 |
except Exception as e:
|
204 |
transcript_en = f"Error translating: {e}"
|
205 |
|
206 |
-
# Analysis
|
207 |
summary = summarize_long_text(transcript_en)
|
208 |
brands = extract_brands(transcript_en)
|
209 |
topics = extract_topics(transcript_en)
|
210 |
key_takeaways = make_bullets(summary)
|
211 |
negatives = extract_negative_keywords(transcript_en)
|
212 |
|
213 |
-
# Extraction
|
214 |
phones = extract_phone_numbers(transcript_en)
|
215 |
emails = extract_emails(transcript_en)
|
216 |
codes = extract_broker_codes(transcript_en)
|
217 |
|
218 |
-
#
|
219 |
-
|
220 |
-
|
221 |
-
matched_phones
|
222 |
-
|
223 |
-
|
224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
225 |
|
226 |
return (
|
227 |
lang_text,
|
@@ -257,7 +271,7 @@ iface = gr.Interface(
|
|
257 |
gr.Textbox(label="Unmatched Email IDs"),
|
258 |
],
|
259 |
title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
|
260 |
-
description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list.
|
261 |
)
|
262 |
|
263 |
iface.launch()
|
|
|
9 |
|
10 |
# ---- CONSTANTS ----
|
11 |
EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
|
12 |
+
CHUNK_SIZE = 600
|
13 |
+
MAX_TRANSCRIPT_WORDS = 3000
|
14 |
|
|
|
15 |
openai.api_key = os.getenv("OPENAI_API_KEY")
|
16 |
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
|
17 |
kw_model = KeyBERT()
|
18 |
|
19 |
+
# --- Load Excel ONCE ---
|
20 |
try:
|
21 |
ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
|
22 |
ap_df.columns = [c.strip() for c in ap_df.columns]
|
|
|
|
|
|
|
23 |
except Exception as e:
|
24 |
+
ap_df = pd.DataFrame()
|
25 |
print(f"Error loading Excel: {e}")
|
26 |
|
27 |
+
# --- Phone/Email/Codes Normalization ---
|
28 |
+
def normalize_phone(p):
|
29 |
+
digits = re.sub(r"\D", "", str(p))
|
30 |
+
if len(digits) > 10:
|
31 |
+
digits = digits[-10:]
|
32 |
+
return digits
|
33 |
+
|
34 |
+
def normalize_email(e):
|
35 |
+
return str(e).strip().lower()
|
36 |
+
|
37 |
+
SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna()) if not ap_df.empty else set()
|
38 |
+
MOBILE_SET = set(normalize_phone(x) for x in ap_df["MOBILE NO."].dropna()) if not ap_df.empty else set()
|
39 |
+
EMAIL_SET = set(normalize_email(x) for x in ap_df["E Mail id"].dropna()) if not ap_df.empty else set()
|
40 |
+
|
41 |
BRANDS = [
|
42 |
"Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
|
43 |
"HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
|
44 |
"Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
|
45 |
"PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
|
46 |
+
"Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential","Motilal","India Infoline","Sherkhan"
|
47 |
]
|
48 |
|
49 |
NEGATIVE_KEYWORDS = [
|
|
|
50 |
"Assuring return", "Invest with us and earn", "Profit Share", "Password share",
|
51 |
"Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
|
52 |
"False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
|
|
|
126 |
return ""
|
127 |
|
128 |
def extract_phone_numbers(text):
|
129 |
+
pattern = r'(\+91[\-\s]?)?([6-9]\d{9})'
|
130 |
matches = re.findall(pattern, text)
|
131 |
+
cleaned = [normalize_phone(m[1]) for m in matches if m[1]]
|
|
|
|
|
|
|
|
|
132 |
return list(set(cleaned))
|
133 |
|
134 |
def extract_emails(text):
|
135 |
+
emails = list(set(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)))
|
136 |
+
return [normalize_email(e) for e in emails]
|
137 |
|
138 |
def extract_broker_codes(text):
|
139 |
codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
|
|
|
176 |
try:
|
177 |
if not audio_path or not isinstance(audio_path, str):
|
178 |
return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
|
|
|
179 |
with open(audio_path, "rb") as audio_file:
|
180 |
transcript = openai.audio.transcriptions.create(
|
181 |
model="whisper-1",
|
|
|
208 |
except Exception as e:
|
209 |
transcript_en = f"Error translating: {e}"
|
210 |
|
|
|
211 |
summary = summarize_long_text(transcript_en)
|
212 |
brands = extract_brands(transcript_en)
|
213 |
topics = extract_topics(transcript_en)
|
214 |
key_takeaways = make_bullets(summary)
|
215 |
negatives = extract_negative_keywords(transcript_en)
|
216 |
|
217 |
+
# Extraction & normalization
|
218 |
phones = extract_phone_numbers(transcript_en)
|
219 |
emails = extract_emails(transcript_en)
|
220 |
codes = extract_broker_codes(transcript_en)
|
221 |
|
222 |
+
# Phone number matching
|
223 |
+
matched_phones = sorted([p for p in phones if p in MOBILE_SET])
|
224 |
+
unmatched_phones = sorted([p for p in phones if p not in MOBILE_SET])
|
225 |
+
if not matched_phones and not unmatched_phones and phones:
|
226 |
+
unmatched_phones = phones # Show what was extracted
|
227 |
+
|
228 |
+
# Email matching
|
229 |
+
matched_emails = sorted([e for e in emails if e in EMAIL_SET])
|
230 |
+
unmatched_emails = sorted([e for e in emails if e not in EMAIL_SET])
|
231 |
+
if not matched_emails and not unmatched_emails and emails:
|
232 |
+
unmatched_emails = emails
|
233 |
+
|
234 |
+
# Broker code matching
|
235 |
+
matched_codes = sorted([c for c in codes if c in SB_TAG_SET])
|
236 |
+
unmatched_codes = sorted([c for c in codes if c not in SB_TAG_SET])
|
237 |
+
if not matched_codes and not unmatched_codes and codes:
|
238 |
+
unmatched_codes = codes
|
239 |
|
240 |
return (
|
241 |
lang_text,
|
|
|
271 |
gr.Textbox(label="Unmatched Email IDs"),
|
272 |
],
|
273 |
title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
|
274 |
+
description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list."
|
275 |
)
|
276 |
|
277 |
iface.launch()
|