import gradio as gr
import openai
from langdetect import detect
from transformers import pipeline
from keybert import KeyBERT
import os
import pandas as pd
import re
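# NOTE: assumed runtime dependencies (not declared in this file): gradio, openai (>=1.x),
# langdetect, transformers, torch, keybert, pandas, openpyxl. The OPENAI_API_KEY environment
# variable (e.g. a Space secret) must be set for the Whisper transcription/translation calls below.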
# ---- CONSTANTS ----
EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
CHUNK_SIZE = 600
MAX_TRANSCRIPT_WORDS = 3000

openai.api_key = os.getenv("OPENAI_API_KEY")
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
kw_model = KeyBERT()
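# Both models above are downloaded from the Hugging Face Hub on first run (bart-large-cnn alone is
# roughly 1.6 GB), so the first startup of the Space can take several minutes.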
# --- Load Excel ONCE ---
try:
    ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
    ap_df.columns = [c.strip() for c in ap_df.columns]
except Exception as e:
    ap_df = pd.DataFrame()
    print(f"Error loading Excel: {e}")
# --- Phone/Email/Codes Normalization ---
def normalize_phone(p):
    digits = re.sub(r"\D", "", str(p))
    if len(digits) > 10:
        digits = digits[-10:]
    return digits

def normalize_email(e):
    return str(e).strip().lower()

SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna()) if not ap_df.empty else set()
MOBILE_SET = set(normalize_phone(x) for x in ap_df["MOBILE NO."].dropna()) if not ap_df.empty else set()
EMAIL_SET = set(normalize_email(x) for x in ap_df["E Mail id"].dropna()) if not ap_df.empty else set()
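# Normalized lookup sets are built once at startup so each extracted code/phone/email can be
# crossmatched in O(1). The column names ("SB TAG", "MOBILE NO.", "E Mail id") must match the
# headers in the AP Excel sheet exactly, otherwise these lookups raise a KeyError at startup.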
BRANDS = [
    "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
    "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
    "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
    "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC",
    "ICICI Prudential", "Motilal", "India Infoline", "Sherkhan"
]
NEGATIVE_KEYWORDS = [
    "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
    "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
    "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
    "Impersonation angel broking", "Fraud cheat", "Portfolio Management Service",
    "Guarantee return", "Guaranteed return", "Tampered document", "Fake document",
    "Forged document", "Promising huge return", "Ponzi Dabba", "Synchronised trade",
    "Made huge profit", "Siphon amount", "Strategy During Market at Angel One",
    "Account Handling", "Fixed Income from Market", "Weekly Expiry Make Money",
    "Imposing Angel One", "Profit Share", "Profit Sharing", "Password Sharing",
    "Password Share", "Unauthorized Trade", "Advisory Services", "Fake Avdisory",
    "Arrest", "Took Money", "Fraud", "Cheat", "Portfolio Management Services", "PMS",
    "Gurantee Return", "Guranteed Return", "Huge Return", "Ponzi", "Dabba",
    "Make Huge Profit", "Siphon Amount", "Accout Handling", "Account Handling Services",
    "Weekly Expire Make Money", "Account Handling", "Account Handle", "huge profit",
    "advisor", "advisory", "assured return", "Premium Advice", "Free Advice",
    "Free Advisory", "Life time free paid calls", "free paid calls", "paid calls",
    "premium advisory", "Get Free Advice", "free calls with accuracy", "Free calls",
    "Options Intraday Tips", "Equity call Intraday", "Equity call Intraday & Delivery",
    "Equity call Delivery", "Premium advisor", "Gurantee Return Services",
    "Guranteed Return Services", "advisor Services", "assured return Services",
    "Premium Advice Services", "Free Advice Services", "Free Advisory Services",
    "Life time free paid calls Services", "free paid calls Services", "paid calls Services",
    "premium advisory Services", "Stock Recommendation", "Amount Doubling",
    "Best Trade Level In Nifty, Bank Nifty With Accuracy", "Daily Accurate Calls",
    "Earn Profit", "Expert Calls", "Fixe Profit Commitment", "Fixed Return", "For Jackpot Trade",
    "Good Profits Daily", "Guaranteed Profit", "Paid Investment Plans", "Jackpot Call",
    "Loss & Profit Sharing", "Nifty Bank-Nifty And Stock Option Calls .", "Pay & Get (Amount)",
    "Sure Shot Calls", "Tips Provide", "Stock tips", "losses", "stock picks", "Multibagger picks",
    "High return on investment", "Insider Trading Offer/Scheme", "Advance Fee Fraud", "Pyramid Scheme",
    "Boiler Room Scam", "Municipal Securities updates", "Churning offers", "Front Running Amount",
    "Wash Trading Amount", "Bear Raiding", "Account Takeover", "Binary Options",
    "Unregistered Securities", "High-Yield Investment Program", "Forex Amount", "Smurfing offers",
    "Invest Quickly", "Trading account opening offer", "Discount on trading account",
    "Bonus on Opening account", "Bull Capturing", "Confirmed Swing Options",
    "Get Dividend every month", "Penny Stock recommendation", "Bawaal Stock Dhamaal return",
    "From thousand to Crores portfolio", "Multibagger stock tips", "Best Over sold stocks",
    "Best Over bought stocks", "High dividend yield stocks", "Future stock recommendation",
    "Growth scanners", "Growth Screeners", "Bullish stock recommendation", "Bull stocks recommendation",
    "Bearish stock recommendation", "Bear stocks recommendation"
]
NEGATIVE_KEYWORDS = [kw.lower() for kw in NEGATIVE_KEYWORDS]
# ---- HELPERS ----
def extract_brands(text):
    # Word-boundary matching avoids false positives from short brand names such as "Fi" or "Cred"
    text_lc = text.lower()
    found = [brand for brand in BRANDS if re.search(r"\b" + re.escape(brand.lower()) + r"\b", text_lc)]
    return found if found else ["None detected"]
def extract_topics(text, top_n=5):
    keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    topics = [kw for kw, score in keywords]
    return topics if topics else ["None extracted"]

def extract_negative_keywords(text):
    if not text:
        return []
    text_lc = text.lower()
    matches = [kw for kw in NEGATIVE_KEYWORDS if kw in text_lc]
    return list(set(matches)) if matches else ["None detected"]

def make_bullets(summary):
    sentences = summary.replace("\n", " ").split('. ')
    bullets = [f"- {s.strip()}" for s in sentences if s.strip()]
    return "\n".join(bullets)
def make_str(val):
    try:
        if val is None:
            return ""
        if isinstance(val, (bool, int, float)):
            return str(val)
        if isinstance(val, list):
            return "\n".join([make_str(v) for v in val])
        if isinstance(val, dict):
            return str(val)
        return str(val)
    except Exception:
        return ""
def extract_phone_numbers(text):
    pattern = r'(\+91[\-\s]?)?([6-9]\d{9})'
    matches = re.findall(pattern, text)
    cleaned = [normalize_phone(m[1]) for m in matches if m[1]]
    return list(set(cleaned))

def extract_emails(text):
    emails = list(set(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)))
    return [normalize_email(e) for e in emails]

def extract_broker_codes(text):
    codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
    return list(set(codes))
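# The broker-code pattern above assumes AP codes look like 2-4 uppercase letters followed by
# 4-6 digits (e.g. a made-up "AB12345"); adjust the regex if the SB TAG format differs.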
def summarize_long_text(text, chunk_size=CHUNK_SIZE):
    words = text.split()
    if len(words) < chunk_size:
        try:
            result = summarizer(text, max_length=100, min_length=30, do_sample=False)
            summary = result[0]["summary_text"] if isinstance(result, list) and "summary_text" in result[0] else str(result)
            return summary
        except Exception as e:
            return f"[Error in summary: {e}]"
    # Chunk if too long
    chunks = []
    current = []
    count = 0
    for w in words:
        current.append(w)
        count += 1
        if count >= chunk_size:
            chunks.append(' '.join(current))
            current = []
            count = 0
    if current:
        chunks.append(' '.join(current))
    summaries = []
    for chunk in chunks:
        try:
            result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
            summary = result[0]["summary_text"] if isinstance(result, list) and "summary_text" in result[0] else str(result)
            summaries.append(summary)
        except Exception as e:
            summaries.append(f"[Error in summary: {e}]")
    return ' '.join(summaries)
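# Chunking at CHUNK_SIZE (600 words) keeps each piece roughly within bart-large-cnn's 1024-token
# input window, which is why long transcripts are summarized chunk by chunk and the partial
# summaries are concatenated instead of feeding the whole transcript to the model at once.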
# ---- MAIN FUNCTION ----
def process_audio(audio_path):
    try:
        if not audio_path or not isinstance(audio_path, str):
            return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
        with open(audio_path, "rb") as audio_file:
            transcript = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )
        transcript = make_str(transcript).strip()
        if not transcript or len(transcript) < 10:
            return ("No recognizable speech found in audio.", "", "", "", "", "", "", "", "", "", "", "", "")
        if len(transcript.split()) > MAX_TRANSCRIPT_WORDS:
            return (f"Transcript too long ({len(transcript.split())} words). Please upload shorter audio.", "", "", "", "", "", "", "", "", "", "", "", "")
        detected_lang = "unknown"  # default so the translation check below never hits an undefined name
        try:
            detected_lang = detect(transcript)
            lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
        except Exception:
            lang_text = "unknown"
        transcript_en = transcript
        if detected_lang != "en":
            try:
                with open(audio_path, "rb") as audio_file:
                    transcript_en = openai.audio.translations.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text"
                    )
                transcript_en = make_str(transcript_en).strip()
            except Exception as e:
                transcript_en = f"Error translating: {e}"

        summary = summarize_long_text(transcript_en)
        brands = extract_brands(transcript_en)
        topics = extract_topics(transcript_en)
        key_takeaways = make_bullets(summary)
        negatives = extract_negative_keywords(transcript_en)

        # Extraction & normalization
        phones = extract_phone_numbers(transcript_en)
        emails = extract_emails(transcript_en)
        codes = extract_broker_codes(transcript_en)

        # Phone number matching
        matched_phones = sorted([p for p in phones if p in MOBILE_SET])
        unmatched_phones = sorted([p for p in phones if p not in MOBILE_SET])
        if not matched_phones and not unmatched_phones and phones:
            unmatched_phones = phones  # Show what was extracted

        # Email matching
        matched_emails = sorted([e for e in emails if e in EMAIL_SET])
        unmatched_emails = sorted([e for e in emails if e not in EMAIL_SET])
        if not matched_emails and not unmatched_emails and emails:
            unmatched_emails = emails

        # Broker code matching
        matched_codes = sorted([c for c in codes if c in SB_TAG_SET])
        unmatched_codes = sorted([c for c in codes if c not in SB_TAG_SET])
        if not matched_codes and not unmatched_codes and codes:
            unmatched_codes = codes

        return (
            lang_text,
            transcript,
            transcript_en,
            ", ".join(brands),
            ", ".join(topics),
            key_takeaways,
            ", ".join(negatives),
            ", ".join(matched_codes), ", ".join(unmatched_codes),
            ", ".join(matched_phones), ", ".join(unmatched_phones),
            ", ".join(matched_emails), ", ".join(unmatched_emails)
        )
    except Exception as e:
        return (f"Error: {e}", "", "", "", "", "", "", "", "", "", "", "", "")
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload MP3/WAV Audio"),
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Original Transcript"),
        gr.Textbox(label="English Transcript (if translated)"),
        gr.Textbox(label="Indian Brokerages & Fintech Brands Detected"),
        gr.Textbox(label="Key Topics"),
        gr.Textbox(label="Bulleted Key Takeaways"),
        gr.Textbox(label="Negative Keywords Detected"),
        gr.Textbox(label="Matched AP Codes"),
        gr.Textbox(label="Unmatched AP Codes"),
        gr.Textbox(label="Matched Phone Numbers"),
        gr.Textbox(label="Unmatched Phone Numbers"),
        gr.Textbox(label="Matched Email IDs"),
        gr.Textbox(label="Unmatched Email IDs"),
    ],
    title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
    description="Upload audio (MP3/WAV). Extracts the transcript, a summary, Indian brokerage brands, and scam keywords, and crossmatches broker codes, phone numbers, and emails against the official AP Excel list."
)

iface.launch()
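# On Hugging Face Spaces this file runs as app.py and launch() exposes the hosted UI; run it
# locally with `python app.py` (Gradio serves on http://127.0.0.1:7860 by default).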