# Hugging Face Space source file (Space status: Running; file size: 12,788 bytes).
# The per-line commit hashes and line-number gutter of the web file view are page chrome, not code.
import gradio as gr
import openai
from langdetect import detect
from transformers import pipeline
from keybert import KeyBERT
import os
import pandas as pd
import re
# ---- CONSTANTS ----
# Workbook holding the AP list; read once at import time to build the lookup sets.
EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
# Default number of whitespace-separated words per chunk passed to the summarizer.
CHUNK_SIZE = 600
# Transcripts with more words than this are rejected by process_audio.
MAX_TRANSCRIPT_WORDS = 3000
# The API key comes from the environment; os.getenv returns None when the variable is unset.
openai.api_key = os.getenv("OPENAI_API_KEY")
# Model objects are created once at import time and reused by the helper functions.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
kw_model = KeyBERT()
# --- Load Excel ONCE ---
# Best-effort load: any failure (missing file, unreadable sheet, ...) leaves an
# empty frame so the app still starts; the lookup sets built from it are then empty.
try:
    ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
    # Header cells are not guaranteed to be strings (a numeric or date header
    # is returned as a number/datetime). Coerce before trimming so that one odd
    # header does not raise and discard the whole sheet via the except branch.
    ap_df.columns = [str(c).strip() for c in ap_df.columns]
except Exception as e:
    ap_df = pd.DataFrame()
    print(f"Error loading Excel: {e}")
# --- Phone/Email/Codes Normalization ---
def normalize_phone(p):
    """Reduce a phone value to its digits, keeping at most the last ten.

    Args:
        p: Phone number as text or as a number (anything ``str()`` accepts).

    Returns:
        A string containing only digits; when more than ten digits are
        present, only the trailing ten are kept (drops prefixes such as
        ``+91`` or a leading ``0``).
    """
    # A numeric column that contains blanks is loaded by pandas as float, so a
    # number can arrive as 9876543210.0. str() would then contribute a spurious
    # trailing "0" digit and the last-ten slice would drop the real first digit.
    if isinstance(p, float) and p.is_integer():
        p = int(p)
    digits = re.sub(r"\D", "", str(p))
    # Slicing is a no-op when there are ten digits or fewer.
    return digits[-10:]
def normalize_email(e):
    """Return *e* as text with surrounding whitespace removed and letters lower-cased."""
    as_text = str(e)
    trimmed = as_text.strip()
    return trimmed.lower()
def _ap_column_values(column):
    """Return the non-null values of *column* from the AP sheet.

    Returns an empty list when the sheet failed to load, has no rows, or does
    not contain the column, so that a renamed header disables one lookup
    instead of raising KeyError while the module is being imported.
    """
    if ap_df.empty:
        return []
    if column not in ap_df.columns:
        print(f"Column not found in AP list: {column}")
        return []
    return ap_df[column].dropna()


# Normalised lookup sets used for cross-matching values found in transcripts.
SB_TAG_SET = {str(x).strip().upper() for x in _ap_column_values("SB TAG")}
MOBILE_SET = {normalize_phone(x) for x in _ap_column_values("MOBILE NO.")}
EMAIL_SET = {normalize_email(x) for x in _ap_column_values("E Mail id")}
# Brand names searched for (case-insensitively) in the transcript by extract_brands.
# Some entries look like alternate spellings of the same brand (e.g. "Sharekhan" /
# "Sherkhan", "Motilal Oswal" / "Motilal") - presumably to catch transcription
# variants; TODO confirm before de-duplicating.
BRANDS = [
"Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
"HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
"Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
"PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
"Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential","Motilal","India Infoline","Sherkhan"
]
# Phrases flagged as potential mis-selling / scam indicators when they appear
# in a transcript. Spelling variants (e.g. "Gurantee", "Accout", "Avdisory")
# are kept as separate entries.
NEGATIVE_KEYWORDS = [
    "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
    "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
    "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
    "Impersonation angel broking", "Fraud cheat", "Portfolio Management Service",
    "Guarantee return", "Guaranteed return", "Tampered document", "Fake document",
    "Forged document", "Promising huge return", "Ponzi Dabba", "Synchronised trade",
    "Made huge profit", "Siphon amount", "Strategy During Market at Angel One",
    "Account Handling", "Fixed Income from Market", "Weekly Expiry Make Money",
    "Imposing Angel One", "Profit Share", "Profit Sharing", "Password Sharing",
    "Password Share", "Unauthorized Trade", "Advisory Services", "Fake Avdisory",
    "Arrest", "Took Money", "Fraud", "Cheat", "Portfolio Management Services", "PMS",
    "Gurantee Return", "Guranteed Return", "Huge Return", "Ponzi", "Dabba",
    "Make Huge Profit", "Siphon Amount", "Accout Handling", "Account Handling Services",
    "Weekly Expire Make Money", "Account Handle", "huge profit",
    "advisor", "advisory", "assured return", "Premium Advice", "Free Advice",
    "Free Advisory", "Life time free paid calls", "free paid calls", "paid calls",
    "premium advisory", "Get Free Advice", "free calls with accuracy", "Free calls",
    "Options Intraday Tips", "Equity call Intraday", "Equity call Intraday & Delivery",
    "Equity call Delivery", "Premium advisor", "Gurantee Return Services",
    "Guranteed Return Services", "advisor Services", "assured return Services",
    "Premium Advice Services", "Free Advice Services", "Free Advisory Services",
    "Life time free paid calls Services", "free paid calls Services", "paid calls Services",
    "premium advisory Services", "Stock Recommendation", "Amount Doubling",
    "Best Trade Level In Nifty, Bank Nifty With Accuracy", "Daily Accurate Calls",
    "Earn Profit", "Expert Calls", "Fixe Profit Commitment", "Fixed Return", "For Jackpot Trade",
    "Good Profits Daily", "Guaranteed Profit", "Paid Investment Plans", "Jackpot Call",
    "Loss & Profit Sharing", "Nifty Bank-Nifty And Stock Option Calls .", "Pay & Get (Amount)",
    "Sure Shot Calls", "Tips Provide", "Stock tips", "losses", "stock picks", "Multibagger picks",
    # Two separate phrases: written as a single fused string, neither could ever match.
    "High return on investment", "Insider Trading Offer/Scheme",
    "Advance Fee Fraud", "Pyramid Scheme",
    "Boiler Room Scam", "Municipal Securities updates", "Churning offers", "Front Running Amount",
    "Wash Trading Amount", "Bear Raiding", "Account Takeover", "Binary Options",
    "Unregistered Securities", "High-Yield Investment Program", "Forex Amount", "Smurfing offers",
    "Invest Quickly", "Trading account opening offer", "Discount on trading account",
    "Bonus on Opening account", "Bull Capturing", "Confirmed Swing Options",
    "Get Dividend every month", "Penny Stock recommendation", "Bawaal Stock Dhamaal return",
    "From thousand to Crores portfolio", "Multibagger stock tips", "Best Over sold stocks",
    "Best Over bought stocks", "High dividend yield stocks", "Future stock recommendation",
    "Growth scanners", "Growth Screeners", "Bullish stock recommendation", "Bull stocks recommendation",
    "Bearish stock recommendation", "Bear stocks recommendation",
]
# Matching is done against lower-cased text, so the phrases are stored
# lower-cased and trimmed; dict.fromkeys drops repeated phrases while keeping
# first-seen order.
NEGATIVE_KEYWORDS = list(dict.fromkeys(kw.strip().lower() for kw in NEGATIVE_KEYWORDS))
# ---- HELPERS ----
def extract_brands(text, brands=None):
    """Return the brand names mentioned in *text*.

    Matching is case-insensitive and anchored on word boundaries, so a short
    brand such as "Fi" or "Cred" is not reported merely because it occurs
    inside another word ("profit", "credit").

    Args:
        text: Text to scan; ``None`` or an empty string yields no matches.
        brands: Optional iterable of brand names to look for. Defaults to the
            module-level ``BRANDS`` list.

    Returns:
        The matching brand names in the order they are listed, or
        ``["None detected"]`` when nothing matches.
    """
    if not text:
        return ["None detected"]
    candidates = BRANDS if brands is None else brands
    found = [
        brand
        for brand in candidates
        # (?<!\w) / (?!\w) instead of \b so that names which begin or end with
        # a non-word character would still be bounded correctly.
        if re.search(r"(?<!\w)" + re.escape(brand) + r"(?!\w)", text, re.IGNORECASE)
    ]
    return found if found else ["None detected"]
def extract_topics(text, top_n=5):
    """Return up to *top_n* keyword phrases for *text* as reported by ``kw_model``.

    The scores returned alongside each phrase are discarded. When the model
    returns nothing, ``["None extracted"]`` is returned instead.
    """
    scored = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    phrases = [phrase for phrase, _score in scored]
    if phrases:
        return phrases
    return ["None extracted"]
def extract_negative_keywords(text, keywords=None):
    """Return the negative keywords that occur in *text*.

    Matching is case-insensitive. A keyword must start at the beginning of a
    word, so "raid" is not reported for "afraid", while inflected forms such as
    "arrested" still match "arrest".

    Args:
        text: Text to scan.
        keywords: Optional iterable of phrases to look for. Defaults to the
            module-level ``NEGATIVE_KEYWORDS`` list.

    Returns:
        ``[]`` when *text* is empty or ``None``; otherwise the distinct
        lower-cased keywords found, in list order, or ``["None detected"]``
        when there are none.
    """
    if not text:
        return []
    haystack = text.lower()
    candidates = NEGATIVE_KEYWORDS if keywords is None else keywords
    hits = [
        kw.lower()
        for kw in candidates
        if re.search(r"(?<!\w)" + re.escape(kw.lower()), haystack)
    ]
    # dict.fromkeys de-duplicates while keeping a stable, repeatable order
    # (a plain set would make the displayed order vary between runs).
    unique_hits = list(dict.fromkeys(hits))
    return unique_hits if unique_hits else ["None detected"]
def make_bullets(summary):
sentences = summary.replace("\n", " ").split('. ')
bullets = [f"- {s.strip()}" for s in sentences if s.strip()]
return "\n".join(bullets)
def make_str(val):
    """Convert *val* to display text without ever raising.

    ``None`` becomes an empty string, a list becomes its items converted
    recursively and joined with newlines, and everything else goes through
    ``str()``. Any exception during conversion yields an empty string.
    """
    try:
        if val is None:
            return ""
        if isinstance(val, list):
            return "\n".join(make_str(item) for item in val)
        # bool, int, float, dict and every other type share the same conversion.
        return str(val)
    except Exception:
        return ""
def extract_phone_numbers(text):
    """Extract distinct ten-digit mobile numbers (first digit 6-9) from *text*.

    An optional ``+91`` / ``91`` country code (optionally followed by a hyphen
    or space) or a leading ``0`` is accepted and dropped. The number must not
    be part of a longer digit run, so "919876543210" yields "9876543210"
    rather than "9198765432", and long account-style numbers yield nothing.

    Args:
        text: Text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        The distinct ten-digit numbers in order of first appearance.
    """
    if not text:
        return []
    pattern = r'(?<!\d)(?:\+?91[\-\s]?|0)?([6-9]\d{9})(?!\d)'
    # The captured group is exactly ten digits, i.e. already in the same
    # digits-only, ten-character form that normalize_phone produces.
    numbers = re.findall(pattern, text)
    return list(dict.fromkeys(numbers))
def extract_emails(text):
    """Extract distinct, normalised e-mail addresses from *text*.

    Addresses are normalised with ``normalize_email`` *before* de-duplication,
    so "A@b.com" and "a@b.com" are reported once.

    Args:
        text: Text to scan; ``None`` or an empty string yields ``[]``.

    Returns:
        The distinct normalised addresses in order of first appearance.
    """
    if not text:
        return []
    found = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return list(dict.fromkeys(normalize_email(addr) for addr in found))
def extract_broker_codes(text):
    """Return the distinct code-like tokens found in *text*.

    The text is upper-cased first; a token is 2-4 letters immediately followed
    by 4-6 digits, delimited by word boundaries. Result order is unspecified.
    """
    upper_text = text.upper()
    unique_codes = set()
    for hit in re.finditer(r'\b([A-Z]{2,4}\d{4,6})\b', upper_text):
        unique_codes.add(hit.group(1))
    return list(unique_codes)
def _summarize_chunk(chunk):
    """Summarize one piece of text, returning an inline error marker instead of raising."""
    try:
        result = summarizer(chunk, max_length=100, min_length=30, do_sample=False)
        if isinstance(result, list) and "summary_text" in result[0]:
            return result[0]["summary_text"]
        return str(result)
    except Exception as e:
        return f"[Error in summary: {e}]"


def summarize_long_text(text, chunk_size=CHUNK_SIZE):
    """Summarize *text*, splitting it into word chunks when it is long.

    Args:
        text: Text to summarize. ``None``, empty or whitespace-only text
            returns ``""`` without invoking the model.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Text with fewer words than this is summarized in a single call.

    Returns:
        The summary. For chunked input, the per-chunk summaries joined by a
        space. A chunk that fails to summarize contributes the marker
        ``"[Error in summary: ...]"`` rather than aborting the whole summary.
    """
    words = text.split() if text else []
    if not words:
        return ""
    if len(words) < chunk_size:
        return _summarize_chunk(text)
    # Guard the step so a zero/negative chunk_size degrades to one word per
    # chunk instead of raising from range().
    step = max(int(chunk_size), 1)
    chunks = [' '.join(words[start:start + step]) for start in range(0, len(words), step)]
    return ' '.join(_summarize_chunk(chunk) for chunk in chunks)
# ---- MAIN FUNCTION ----
def _status_only(message):
    """Build the 13-field result with *message* in the first field and the rest blank."""
    return (message,) + ("",) * 12


def _crossmatch(values, known):
    """Split *values* into ``(matched, unmatched)`` sorted lists by membership in *known*."""
    matched = sorted(v for v in values if v in known)
    unmatched = sorted(v for v in values if v not in known)
    return matched, unmatched


def process_audio(audio_path):
    """Transcribe an audio file and analyse the transcript.

    Args:
        audio_path: Path of the uploaded audio file.

    Returns:
        A 13-tuple of strings, in this order: detected language, original
        transcript, English transcript, brands, key topics, bulleted
        takeaways, negative keywords, matched codes, unmatched codes, matched
        phones, unmatched phones, matched e-mails, unmatched e-mails.
        On failure or rejected input, the first field carries the message and
        the remaining twelve fields are empty. The function never raises.
    """
    try:
        if not audio_path or not isinstance(audio_path, str):
            return _status_only("No audio file provided.")
        with open(audio_path, "rb") as audio_file:
            transcript = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )
        transcript = make_str(transcript).strip()
        if not transcript or len(transcript) < 10:
            return _status_only("No recognizable speech found in audio.")
        word_count = len(transcript.split())
        if word_count > MAX_TRANSCRIPT_WORDS:
            return _status_only(
                f"Transcript too long ({word_count} words). Please upload shorter audio."
            )
        try:
            detected_lang = detect(transcript)
            lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
        except Exception:
            # Keep detected_lang bound: leaving it undefined made the check
            # below raise NameError and abort the whole request. An unknown
            # language is treated as non-English, so translation is attempted.
            detected_lang = None
            lang_text = "unknown"
        transcript_en = transcript
        analysis_text = transcript
        if detected_lang != "en":
            try:
                # Re-open the file: the first handle was consumed by transcription.
                with open(audio_path, "rb") as audio_file:
                    transcript_en = openai.audio.translations.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text"
                    )
                transcript_en = make_str(transcript_en).strip()
                analysis_text = transcript_en or transcript
            except Exception as e:
                # Show the failure in the English-transcript field, but keep
                # analysing the original transcript rather than the error text.
                transcript_en = f"Error translating: {e}"
        summary = summarize_long_text(analysis_text)
        brands = extract_brands(analysis_text)
        topics = extract_topics(analysis_text)
        key_takeaways = make_bullets(summary)
        negatives = extract_negative_keywords(analysis_text)
        # Extraction & cross-matching against the AP list lookup sets.
        matched_codes, unmatched_codes = _crossmatch(extract_broker_codes(analysis_text), SB_TAG_SET)
        matched_phones, unmatched_phones = _crossmatch(extract_phone_numbers(analysis_text), MOBILE_SET)
        matched_emails, unmatched_emails = _crossmatch(extract_emails(analysis_text), EMAIL_SET)
        return (
            lang_text,
            transcript,
            transcript_en,
            ", ".join(brands),
            ", ".join(topics),
            key_takeaways,
            ", ".join(negatives),
            ", ".join(matched_codes), ", ".join(unmatched_codes),
            ", ".join(matched_phones), ", ".join(unmatched_phones),
            ", ".join(matched_emails), ", ".join(unmatched_emails)
        )
    except Exception as e:
        # Top-level boundary for the UI callback: report instead of raising.
        return _status_only(f"Error: {e}")
# Gradio UI: one audio upload in, thirteen text boxes out. The boxes are listed
# in the same order as the fields of the tuple returned by process_audio.
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload MP3/WAV Audio"),
    outputs=[
        gr.Textbox(label=box_label)
        for box_label in (
            "Detected Language",
            "Original Transcript",
            "English Transcript (if translated)",
            "Indian Brokerages & Fintech Brands Detected",
            "Key Topics",
            "Bulleted Key Takeaways",
            "Negative Keywords Detected",
            "Matched AP Codes",
            "Unmatched AP Codes",
            "Matched Phone Numbers",
            "Unmatched Phone Numbers",
            "Matched Email IDs",
            "Unmatched Email IDs",
        )
    ],
    title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
    description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list.",
)
# Start the web app as soon as the module is executed.
iface.launch()
|