# Audiototext / app.py
# jaisun2004's picture
# Update app.py
# 6e420f8 verified
import gradio as gr
import openai
from langdetect import detect
from transformers import pipeline
from keybert import KeyBERT
import os
import pandas as pd
import re
# ---- CONSTANTS ----
# Workbook holding the AP list that extracted codes/phones/emails are matched against.
EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
# Default number of words per summarization chunk.
CHUNK_SIZE = 600
# Transcripts with more words than this are rejected by process_audio.
MAX_TRANSCRIPT_WORDS = 3000

# The API key comes from the environment; it is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Both models are created once at import time and reused by every request.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
kw_model = KeyBERT()
# --- Load Excel ONCE ---
try:
    ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
    # Headers are not guaranteed to be strings (numeric or blank header cells);
    # coerce before stripping so one odd header does not discard the whole sheet.
    ap_df.columns = [str(c).strip() for c in ap_df.columns]
except Exception as e:
    # Best-effort load: fall back to an empty frame so the app still starts,
    # in which case every extracted value is reported as unmatched.
    ap_df = pd.DataFrame()
    print(f"Error loading Excel: {e}")
# --- Phone/Email/Codes Normalization ---
def normalize_phone(p):
    """Return the digits of a phone value, keeping only the last ten.

    Args:
        p: Phone number as text or as a numeric spreadsheet cell.

    Returns:
        A string of digits, at most ten characters long. Values that contain
        no digits (including None and NaN) yield an empty string.
    """
    # Numeric spreadsheet columns are read as floats; str(9876543210.0) is
    # "9876543210.0", whose trailing "0" would otherwise be counted as a digit
    # and shift the last-ten window by one.
    if isinstance(p, float) and p.is_integer():
        p = int(p)
    digits = re.sub(r"\D", "", str(p))
    # Drop country-code / trunk prefixes by keeping the trailing ten digits.
    return digits[-10:] if len(digits) > 10 else digits
def normalize_email(e):
    """Return *e* as text with surrounding whitespace removed, in lowercase."""
    as_text = str(e)
    trimmed = as_text.strip()
    return trimmed.lower()
def _ap_column_values(column, normalize):
    """Return the set of normalized non-null values found in ap_df[column].

    An empty set is returned when the sheet failed to load or does not contain
    the column, so a renamed or missing header cannot crash the app at import.
    """
    if ap_df.empty or column not in ap_df.columns:
        return set()
    return {normalize(x) for x in ap_df[column].dropna()}


# Lookup sets built once at import time; membership tests are done per request.
SB_TAG_SET = _ap_column_values("SB TAG", lambda x: str(x).strip().upper())
MOBILE_SET = _ap_column_values("MOBILE NO.", normalize_phone)
EMAIL_SET = _ap_column_values("E Mail id", normalize_email)
# Brand names searched for in the English transcript. Order matters: detected
# brands are reported in this order.
BRANDS = [
    "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan",
    "5paisa", "ICICI Direct",
    "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money",
    "Edelweiss", "Geojit",
    "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter",
    "Fi", "INDmoney",
    "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo",
    "Razorpay", "ETMoney",
    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS",
    "Karvy", "LIC", "ICICI Prudential", "Motilal", "India Infoline", "Sherkhan",
]
# Phrases flagged as scam / mis-selling indicators. Misspelled variants
# (e.g. "Gurantee", "Avdisory") are kept as they appear in the original list.
NEGATIVE_KEYWORDS = [
    "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
    "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
    "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
    "Impersonation angel broking", "Fraud cheat", "Portfolio Management Service",
    "Guarantee return", "Guaranteed return", "Tampered document", "Fake document",
    "Forged document", "Promising huge return", "Ponzi Dabba", "Synchronised trade",
    "Made huge profit", "Siphon amount", "Strategy During Market at Angel One",
    "Account Handling", "Fixed Income from Market", "Weekly Expiry Make Money",
    "Imposing Angel One", "Profit Share", "Profit Sharing", "Password Sharing",
    "Password Share", "Unauthorized Trade", "Advisory Services", "Fake Avdisory",
    "Arrest", "Took Money", "Fraud", "Cheat", "Portfolio Management Services", "PMS",
    "Gurantee Return", "Guranteed Return", "Huge Return", "Ponzi", "Dabba",
    "Make Huge Profit", "Siphon Amount", "Accout Handling", "Account Handling Services",
    # Was "Account Handling" followed by a mis-decoded non-breaking space,
    # which could never match a transcript.
    "Weekly Expire Make Money", "Account Handling", "Account Handle", "huge profit",
    "advisor", "advisory", "assured return", "Premium Advice", "Free Advice",
    "Free Advisory", "Life time free paid calls", "free paid calls", "paid calls",
    "premium advisory", "Get Free Advice", "free calls with accuracy", "Free calls",
    "Options Intraday Tips", "Equity call Intraday", "Equity call Intraday & Delivery",
    "Equity call Delivery", "Premium advisor", "Gurantee Return Services",
    "Guranteed Return Services", "advisor Services", "assured return Services",
    "Premium Advice Services", "Free Advice Services", "Free Advisory Services",
    "Life time free paid calls Services", "free paid calls Services", "paid calls Services",
    "premium advisory Services", "Stock Recommendation", "Amount Doubling",
    "Best Trade Level In Nifty, Bank Nifty With Accuracy", "Daily Accurate Calls",
    "Earn Profit", "Expert Calls", "Fixe Profit Commitment", "Fixed Return", "For Jackpot Trade",
    "Good Profits Daily", "Guaranteed Profit", "Paid Investment Plans", "Jackpot Call",
    "Loss & Profit Sharing", "Nifty Bank-Nifty And Stock Option Calls .", "Pay & Get (Amount)",
    "Sure Shot Calls", "Tips Provide", "Stock tips", "losses", "stock picks", "Multibagger picks",
    # These two phrases were fused into one string by a missing comma.
    "High return on investment", "Insider Trading Offer/Scheme",
    "Advance Fee Fraud", "Pyramid Scheme",
    "Boiler Room Scam", "Municipal Securities updates", "Churning offers", "Front Running Amount",
    "Wash Trading Amount", "Bear Raiding", "Account Takeover", "Binary Options",
    "Unregistered Securities", "High-Yield Investment Program", "Forex Amount", "Smurfing offers",
    "Invest Quickly", "Trading account opening offer", "Discount on trading account",
    "Bonus on Opening account", "Bull Capturing", "Confirmed Swing Options",
    "Get Dividend every month", "Penny Stock recommendation", "Bawaal Stock Dhamaal return",
    "From thousand to Crores portfolio", "Multibagger stock tips", "Best Over sold stocks",
    "Best Over bought stocks", "High dividend yield stocks", "Future stock recommendation",
    "Growth scanners", "Growth Screeners", "Bullish stock recommendation", "Bull stocks recommendation",
    "Bearish stock recommendation", "Bear stocks recommendation",
]
# Lowercase for case-insensitive matching; dict.fromkeys removes entries that
# become duplicates after lowercasing while keeping first-seen order.
NEGATIVE_KEYWORDS = list(dict.fromkeys(kw.strip().lower() for kw in NEGATIVE_KEYWORDS))
# ---- HELPERS ----
def extract_brands(text):
    """Return the brands from BRANDS mentioned in *text*, in BRANDS order.

    Matching is case-insensitive and requires the brand to appear as a whole
    word or phrase. Plain substring matching made short names fire inside
    ordinary words ("Fi" in "profit", "LIC" in "click", "Cred" in "credit").

    Args:
        text: Transcript text; None or empty is treated as "no brands".

    Returns:
        A list of matching brand names, or ["None detected"] when none match.
    """
    if not text:
        return ["None detected"]
    haystack = text.lower()  # lowercase once instead of once per brand
    found = [
        brand
        for brand in BRANDS
        # Lookarounds instead of \b so brands that start or end with a
        # non-word character would still be delimited correctly.
        if re.search(rf"(?<!\w){re.escape(brand.lower())}(?!\w)", haystack)
    ]
    return found or ["None detected"]
def extract_topics(text, top_n=5):
    """Return up to *top_n* keyword phrases for *text* from the KeyBERT model.

    Falls back to ["None extracted"] when the model yields no keywords.
    """
    scored_keywords = kw_model.extract_keywords(text, top_n=top_n, stop_words='english')
    topics = []
    for phrase, _score in scored_keywords:
        topics.append(phrase)
    if not topics:
        return ["None extracted"]
    return topics
def extract_negative_keywords(text):
    """Return the NEGATIVE_KEYWORDS entries that occur in *text*.

    Matching is a case-insensitive substring test. Results are de-duplicated
    and returned in NEGATIVE_KEYWORDS order, so the output is the same on
    every run (a set-based de-dup made the order depend on string hashing).

    Args:
        text: Transcript text.

    Returns:
        [] when *text* is empty or None, ["None detected"] when nothing
        matches, otherwise the list of matched keywords.
    """
    if not text:
        return []
    text_lc = text.lower()
    matches = list(dict.fromkeys(kw for kw in NEGATIVE_KEYWORDS if kw in text_lc))
    return matches or ["None detected"]
def make_bullets(summary):
    """Turn *summary* into one "- " bullet per sentence, joined by newlines.

    Newlines are flattened to spaces and the text is split on ". "; blank
    fragments are skipped.
    """
    flattened = summary.replace("\n", " ")
    lines = []
    for fragment in flattened.split('. '):
        cleaned = fragment.strip()
        if cleaned:
            lines.append("- " + cleaned)
    return "\n".join(lines)
def make_str(val):
    """Convert *val* to display text without ever raising.

    None becomes "", a list becomes its items converted recursively and joined
    with newlines, and anything else goes through str(). Any conversion error
    yields "".
    """
    try:
        if val is None:
            return ""
        if isinstance(val, list):
            pieces = (make_str(item) for item in val)
            return "\n".join(pieces)
        # Scalars, dicts and all remaining types share the plain str() path.
        return str(val)
    except Exception:
        return ""
def extract_phone_numbers(text):
    """Return the distinct 10-digit Indian mobile numbers found in *text*.

    A number is ten digits starting with 6-9, optionally preceded by "+91",
    "91" or a leading "0", and must not be embedded in a longer digit run.
    Without those boundaries "919876543210" was read as "9198765432" and long
    reference numbers produced bogus matches.

    Args:
        text: Transcript text; None or empty yields [].

    Returns:
        A sorted list of unique 10-digit strings, prefix removed.
    """
    if not text:
        return []
    pattern = r'(?<!\d)(?:\+?91[\-\s]?|0)?([6-9]\d{9})(?!\d)'
    # The capture group is always exactly ten digits, so it is already in the
    # last-ten-digits form used for the AP list numbers.
    return sorted({m.group(1) for m in re.finditer(pattern, text)})
def extract_emails(text):
    """Return the distinct e-mail addresses found in *text*, normalized.

    Addresses are normalized with normalize_email *before* de-duplication, so
    "A@x.com" and "a@x.com" collapse to one entry (de-duplicating first let
    case variants through as duplicates).

    Args:
        text: Transcript text; None or empty yields [].

    Returns:
        A sorted list of unique normalized addresses.
    """
    if not text:
        return []
    found = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)
    return sorted({normalize_email(address) for address in found})
def extract_broker_codes(text):
    """Return the distinct code-like tokens in *text*, uppercased.

    A token is 2-4 letters followed by 4-6 digits, delimited by word
    boundaries. The order of the returned list is unspecified.
    """
    upper_text = text.upper()
    unique_codes = set()
    for match in re.finditer(r'\b([A-Z]{2,4}\d{4,6})\b', upper_text):
        unique_codes.add(match.group(1))
    return list(unique_codes)
def _summarize_chunk(chunk):
    """Summarize one piece of text; on failure return an inline error marker."""
    try:
        # truncation=True keeps an unusually token-dense chunk within the
        # model's input limit instead of failing the whole chunk.
        result = summarizer(
            chunk, max_length=100, min_length=30, do_sample=False, truncation=True
        )
        if isinstance(result, list) and result and "summary_text" in result[0]:
            return result[0]["summary_text"]
        return str(result)
    except Exception as e:
        return f"[Error in summary: {e}]"


def summarize_long_text(text, chunk_size=CHUNK_SIZE):
    """Summarize *text*, splitting it into word chunks when it is long.

    Text with fewer than *chunk_size* words is summarized in one call. Longer
    text is cut into consecutive chunks of *chunk_size* words; each chunk is
    summarized separately and the summaries are joined with spaces.

    Args:
        text: Text to summarize.
        chunk_size: Maximum number of words per chunk.

    Returns:
        The summary text. Empty or whitespace-only input yields "". A chunk
        that fails to summarize contributes "[Error in summary: ...]" instead
        of raising.
    """
    words = text.split()
    if not words:
        # Nothing to summarize; avoid sending an empty string to the model.
        return ""
    if len(words) < chunk_size:
        return _summarize_chunk(text)
    step = max(1, chunk_size)  # guard against a zero/negative chunk size
    chunks = [' '.join(words[i:i + step]) for i in range(0, len(words), step)]
    return ' '.join(_summarize_chunk(chunk) for chunk in chunks)
# ---- MAIN FUNCTION ----
# Number of values process_audio returns; must equal the number of Gradio
# output components it is wired to.
_OUTPUT_COUNT = 13


def _status_row(message):
    """Return an output tuple with *message* first and every other field empty."""
    return (message,) + ("",) * (_OUTPUT_COUNT - 1)


def _partition_known(values, known):
    """Split *values* into (sorted matched, sorted unmatched) by membership in *known*."""
    matched = sorted(v for v in values if v in known)
    unmatched = sorted(v for v in values if v not in known)
    return matched, unmatched


def process_audio(audio_path):
    """Transcribe an audio file and derive all report fields from it.

    Args:
        audio_path: Filesystem path of the uploaded audio.

    Returns:
        A 13-tuple of strings, in order: detected language, original
        transcript, English transcript, brands, topics, bulleted takeaways,
        negative keywords, matched codes, unmatched codes, matched phones,
        unmatched phones, matched e-mails, unmatched e-mails. On any failure
        or rejected input the first element carries the message and the rest
        are empty strings; the function does not raise.
    """
    try:
        if not audio_path or not isinstance(audio_path, str):
            return _status_row("No audio file provided.")

        with open(audio_path, "rb") as audio_file:
            transcript = openai.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text"
            )
        transcript = make_str(transcript).strip()
        if not transcript or len(transcript) < 10:
            return _status_row("No recognizable speech found in audio.")

        word_count = len(transcript.split())
        if word_count > MAX_TRANSCRIPT_WORDS:
            return _status_row(
                f"Transcript too long ({word_count} words). Please upload shorter audio."
            )

        try:
            detected_lang = detect(transcript)
            lang_text = {'en': 'English', 'hi': 'Hindi', 'ta': 'Tamil'}.get(detected_lang, detected_lang)
        except Exception:
            # Must be bound: it is read below. Previously a detection failure
            # left it undefined and the whole run ended as an error row.
            detected_lang = None
            lang_text = "unknown"

        transcript_en = transcript
        # Text the analysis runs on; stays the original transcript unless a
        # translation actually succeeds.
        analysis_text = transcript
        if detected_lang != "en":
            # Also taken when the language is unknown, since the keyword and
            # brand lists are English.
            try:
                with open(audio_path, "rb") as audio_file:
                    translated = openai.audio.translations.create(
                        model="whisper-1",
                        file=audio_file,
                        response_format="text"
                    )
                transcript_en = make_str(translated).strip()
                if transcript_en:
                    analysis_text = transcript_en
            except Exception as e:
                # Show the failure in the English-transcript field, but do not
                # analyze the error message as if it were speech.
                transcript_en = f"Error translating: {e}"

        summary = summarize_long_text(analysis_text)
        brands = extract_brands(analysis_text)
        topics = extract_topics(analysis_text)
        key_takeaways = make_bullets(summary)
        negatives = extract_negative_keywords(analysis_text)

        # Cross-match extracted identifiers against the AP list lookup sets.
        matched_phones, unmatched_phones = _partition_known(
            extract_phone_numbers(analysis_text), MOBILE_SET
        )
        matched_emails, unmatched_emails = _partition_known(
            extract_emails(analysis_text), EMAIL_SET
        )
        matched_codes, unmatched_codes = _partition_known(
            extract_broker_codes(analysis_text), SB_TAG_SET
        )

        return (
            lang_text,
            transcript,
            transcript_en,
            ", ".join(brands),
            ", ".join(topics),
            key_takeaways,
            ", ".join(negatives),
            ", ".join(matched_codes), ", ".join(unmatched_codes),
            ", ".join(matched_phones), ", ".join(unmatched_phones),
            ", ".join(matched_emails), ", ".join(unmatched_emails)
        )
    except Exception as e:
        # UI boundary: report the failure in the first field instead of raising.
        return _status_row(f"Error: {e}")
# One textbox per value returned by process_audio, in the same order.
_OUTPUT_LABELS = [
    "Detected Language",
    "Original Transcript",
    "English Transcript (if translated)",
    "Indian Brokerages & Fintech Brands Detected",
    "Key Topics",
    "Bulleted Key Takeaways",
    "Negative Keywords Detected",
    "Matched AP Codes",
    "Unmatched AP Codes",
    "Matched Phone Numbers",
    "Unmatched Phone Numbers",
    "Matched Email IDs",
    "Unmatched Email IDs",
]

iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload MP3/WAV Audio"),
    outputs=[gr.Textbox(label=label) for label in _OUTPUT_LABELS],
    title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
    description=(
        "Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, "
        "scam keywords, and crossmatch broker codes, phone numbers, emails with the "
        "official AP Excel list."
    ),
)
iface.launch()