Spaces:

jaisun2004
/

Audiototext

Running

App Files Community

jaisun2004 committed on May 29

Commit

6e420f8

verified ·

1 Parent(s): bf8ca15

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -28

app.py CHANGED Viewed

@@ -9,35 +9,44 @@ import re
 # ---- CONSTANTS ----
 EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
-CHUNK_SIZE = 600  # words per summary chunk, lower for speed
-MAX_TRANSCRIPT_WORDS = 3000  # warn if transcript is very large
-# --- LOAD ONCE AT STARTUP ---
 openai.api_key = os.getenv("OPENAI_API_KEY")
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 kw_model = KeyBERT()
-# Load Excel once
 try:
     ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
     ap_df.columns = [c.strip() for c in ap_df.columns]
-    SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna())
-    MOBILE_SET = set(re.sub(r'\D', '', str(x).strip()) for x in ap_df["MOBILE NO."].dropna())
-    EMAIL_SET = set(str(x).strip().lower() for x in ap_df["E Mail id"].dropna())
 except Exception as e:
-    SB_TAG_SET, MOBILE_SET, EMAIL_SET = set(), set(), set()
     print(f"Error loading Excel: {e}")
 BRANDS = [
     "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
     "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
     "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
     "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
-    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential", "Motilal", "India Infoline"
 ]
 NEGATIVE_KEYWORDS = [
         "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
         "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
         "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
@@ -117,17 +126,14 @@ def make_str(val):
         return ""
 def extract_phone_numbers(text):
-    pattern = r'(\+91[\-\s]?)?[6-9]\d{1}[\-\s]?\d{4}[\-\s]?\d{5}\b'
     matches = re.findall(pattern, text)
-    cleaned = []
-    for m in matches:
-        possible = ''.join(re.findall(r'\d', m))
-        if len(possible) >= 10:
-            cleaned.append(possible[-10:])
     return list(set(cleaned))
 def extract_emails(text):
-    return list(set(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)))
 def extract_broker_codes(text):
     codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
@@ -170,7 +176,6 @@ def process_audio(audio_path):
     try:
         if not audio_path or not isinstance(audio_path, str):
             return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
-        # Transcription
         with open(audio_path, "rb") as audio_file:
             transcript = openai.audio.transcriptions.create(
                 model="whisper-1",
@@ -203,25 +208,34 @@ def process_audio(audio_path):
             except Exception as e:
                 transcript_en = f"Error translating: {e}"
-        # Analysis
         summary = summarize_long_text(transcript_en)
         brands = extract_brands(transcript_en)
         topics = extract_topics(transcript_en)
         key_takeaways = make_bullets(summary)
         negatives = extract_negative_keywords(transcript_en)
-        # Extraction
         phones = extract_phone_numbers(transcript_en)
         emails = extract_emails(transcript_en)
         codes = extract_broker_codes(transcript_en)
-        # Matching (all sets are loaded ONCE at startup)
-        matched_codes = sorted(set([c for c in codes if c in SB_TAG_SET]))
-        unmatched_codes = sorted(set([c for c in codes if c not in SB_TAG_SET]))
-        matched_phones = sorted(set([p for p in phones if p in MOBILE_SET]))
-        unmatched_phones = sorted(set([p for p in phones if p not in MOBILE_SET]))
-        matched_emails = sorted(set([e for e in emails if e.lower() in EMAIL_SET]))
-        unmatched_emails = sorted(set([e for e in emails if e.lower() not in EMAIL_SET]))
         return (
             lang_text,
@@ -257,7 +271,7 @@ iface = gr.Interface(
         gr.Textbox(label="Unmatched Email IDs"),
     ],
     title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
-    description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list. Fast & production-ready."
 )
 iface.launch()

 # ---- CONSTANTS ----
 EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
+CHUNK_SIZE = 600
+MAX_TRANSCRIPT_WORDS = 3000
 openai.api_key = os.getenv("OPENAI_API_KEY")
 summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
 kw_model = KeyBERT()
+# --- Load Excel ONCE ---
 try:
     ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
     ap_df.columns = [c.strip() for c in ap_df.columns]
 except Exception as e:
+    ap_df = pd.DataFrame()
     print(f"Error loading Excel: {e}")
+# --- Phone/Email/Codes Normalization ---
+def normalize_phone(p):
+    digits = re.sub(r"\D", "", str(p))
+    if len(digits) > 10:
+        digits = digits[-10:]
+    return digits
+def normalize_email(e):
+    return str(e).strip().lower()
+SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna()) if not ap_df.empty else set()
+MOBILE_SET = set(normalize_phone(x) for x in ap_df["MOBILE NO."].dropna()) if not ap_df.empty else set()
+EMAIL_SET = set(normalize_email(x) for x in ap_df["E Mail id"].dropna()) if not ap_df.empty else set()
 BRANDS = [
     "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
     "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
     "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
     "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
+    "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential","Motilal","India Infoline","Sherkhan"
 ]
 NEGATIVE_KEYWORDS = [
         "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
         "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
         "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
         return ""
 def extract_phone_numbers(text):
+    pattern = r'(\+91[\-\s]?)?([6-9]\d{9})'
     matches = re.findall(pattern, text)
+    cleaned = [normalize_phone(m[1]) for m in matches if m[1]]
     return list(set(cleaned))
 def extract_emails(text):
+    emails = list(set(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)))
+    return [normalize_email(e) for e in emails]
 def extract_broker_codes(text):
     codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
     try:
         if not audio_path or not isinstance(audio_path, str):
             return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
         with open(audio_path, "rb") as audio_file:
             transcript = openai.audio.transcriptions.create(
                 model="whisper-1",
             except Exception as e:
                 transcript_en = f"Error translating: {e}"
         summary = summarize_long_text(transcript_en)
         brands = extract_brands(transcript_en)
         topics = extract_topics(transcript_en)
         key_takeaways = make_bullets(summary)
         negatives = extract_negative_keywords(transcript_en)
+        # Extraction & normalization
         phones = extract_phone_numbers(transcript_en)
         emails = extract_emails(transcript_en)
         codes = extract_broker_codes(transcript_en)
+        # Phone number matching
+        matched_phones = sorted([p for p in phones if p in MOBILE_SET])
+        unmatched_phones = sorted([p for p in phones if p not in MOBILE_SET])
+        if not matched_phones and not unmatched_phones and phones:
+            unmatched_phones = phones  # Show what was extracted
+        # Email matching
+        matched_emails = sorted([e for e in emails if e in EMAIL_SET])
+        unmatched_emails = sorted([e for e in emails if e not in EMAIL_SET])
+        if not matched_emails and not unmatched_emails and emails:
+            unmatched_emails = emails
+        # Broker code matching
+        matched_codes = sorted([c for c in codes if c in SB_TAG_SET])
+        unmatched_codes = sorted([c for c in codes if c not in SB_TAG_SET])
+        if not matched_codes and not unmatched_codes and codes:
+            unmatched_codes = codes
         return (
             lang_text,
         gr.Textbox(label="Unmatched Email IDs"),
     ],
     title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
+    description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list."
 )
 iface.launch()