jaisun2004 committed on
Commit
6e420f8
·
verified ·
1 Parent(s): bf8ca15

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -28
app.py CHANGED
@@ -9,35 +9,44 @@ import re
9
 
10
  # ---- CONSTANTS ----
11
  EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
12
- CHUNK_SIZE = 600 # words per summary chunk, lower for speed
13
- MAX_TRANSCRIPT_WORDS = 3000 # warn if transcript is very large
14
 
15
- # --- LOAD ONCE AT STARTUP ---
16
  openai.api_key = os.getenv("OPENAI_API_KEY")
17
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
18
  kw_model = KeyBERT()
19
 
20
- # Load Excel once
21
  try:
22
  ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
23
  ap_df.columns = [c.strip() for c in ap_df.columns]
24
- SB_TAG_SET = set(str(x).strip().upper() for x in ap_df["SB TAG"].dropna())
25
- MOBILE_SET = set(re.sub(r'\D', '', str(x).strip()) for x in ap_df["MOBILE NO."].dropna())
26
- EMAIL_SET = set(str(x).strip().lower() for x in ap_df["E Mail id"].dropna())
27
  except Exception as e:
28
- SB_TAG_SET, MOBILE_SET, EMAIL_SET = set(), set(), set()
29
  print(f"Error loading Excel: {e}")
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  BRANDS = [
32
  "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
33
  "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
34
  "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
35
  "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
36
- "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential", "Motilal", "India Infoline"
37
  ]
38
 
39
  NEGATIVE_KEYWORDS = [
40
-
41
  "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
42
  "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
43
  "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
@@ -117,17 +126,14 @@ def make_str(val):
117
  return ""
118
 
119
  def extract_phone_numbers(text):
120
- pattern = r'(\+91[\-\s]?)?[6-9]\d{1}[\-\s]?\d{4}[\-\s]?\d{5}\b'
121
  matches = re.findall(pattern, text)
122
- cleaned = []
123
- for m in matches:
124
- possible = ''.join(re.findall(r'\d', m))
125
- if len(possible) >= 10:
126
- cleaned.append(possible[-10:])
127
  return list(set(cleaned))
128
 
129
  def extract_emails(text):
130
- return list(set(re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text)))
 
131
 
132
  def extract_broker_codes(text):
133
  codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
@@ -170,7 +176,6 @@ def process_audio(audio_path):
170
  try:
171
  if not audio_path or not isinstance(audio_path, str):
172
  return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
173
- # Transcription
174
  with open(audio_path, "rb") as audio_file:
175
  transcript = openai.audio.transcriptions.create(
176
  model="whisper-1",
@@ -203,25 +208,34 @@ def process_audio(audio_path):
203
  except Exception as e:
204
  transcript_en = f"Error translating: {e}"
205
 
206
- # Analysis
207
  summary = summarize_long_text(transcript_en)
208
  brands = extract_brands(transcript_en)
209
  topics = extract_topics(transcript_en)
210
  key_takeaways = make_bullets(summary)
211
  negatives = extract_negative_keywords(transcript_en)
212
 
213
- # Extraction
214
  phones = extract_phone_numbers(transcript_en)
215
  emails = extract_emails(transcript_en)
216
  codes = extract_broker_codes(transcript_en)
217
 
218
- # Matching (all sets are loaded ONCE at startup)
219
- matched_codes = sorted(set([c for c in codes if c in SB_TAG_SET]))
220
- unmatched_codes = sorted(set([c for c in codes if c not in SB_TAG_SET]))
221
- matched_phones = sorted(set([p for p in phones if p in MOBILE_SET]))
222
- unmatched_phones = sorted(set([p for p in phones if p not in MOBILE_SET]))
223
- matched_emails = sorted(set([e for e in emails if e.lower() in EMAIL_SET]))
224
- unmatched_emails = sorted(set([e for e in emails if e.lower() not in EMAIL_SET]))
 
 
 
 
 
 
 
 
 
 
225
 
226
  return (
227
  lang_text,
@@ -257,7 +271,7 @@ iface = gr.Interface(
257
  gr.Textbox(label="Unmatched Email IDs"),
258
  ],
259
  title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
260
- description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list. Fast & production-ready."
261
  )
262
 
263
  iface.launch()
 
9
 
10
  # ---- CONSTANTS ----
11
  EXCEL_PATH = "Active AP list - 05.12.2024 4.xlsx"
12
+ CHUNK_SIZE = 600
13
+ MAX_TRANSCRIPT_WORDS = 3000
14
 
 
15
  openai.api_key = os.getenv("OPENAI_API_KEY")
16
  summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
17
  kw_model = KeyBERT()
18
 
19
+ # --- Load Excel ONCE ---
20
  try:
21
  ap_df = pd.read_excel(EXCEL_PATH, engine="openpyxl")
22
  ap_df.columns = [c.strip() for c in ap_df.columns]
 
 
 
23
  except Exception as e:
24
+ ap_df = pd.DataFrame()
25
  print(f"Error loading Excel: {e}")
26
 
27
+ # --- Phone/Email/Codes Normalization ---
28
def normalize_phone(p):
    """Return the last ten digits of a phone value as a string.

    Args:
        p: Phone value as read from the spreadsheet or extracted from text.
            May be a string, an int, or a float.

    Returns:
        A digits-only string of at most ten characters (the trailing ten
        digits when more are present), or "" when the value has no digits.
    """
    # A numeric spreadsheet column containing blanks is loaded as floats, so
    # 9876543210 arrives as 9876543210.0. Stringifying that directly would
    # turn the ".0" into an extra trailing digit and corrupt the number.
    if isinstance(p, float) and p.is_integer():
        p = int(p)
    digits = re.sub(r"\D", "", str(p))
    # Keep only the subscriber number; drops a leading country code or 0.
    return digits[-10:]
33
+
34
def normalize_email(e):
    """Return an e-mail value trimmed and lower-cased for comparison.

    Args:
        e: E-mail value as read from the spreadsheet or extracted from text.

    Returns:
        The stripped, lower-cased string, or "" for a missing value
        (None or a float NaN).
    """
    # Missing cells must not become the literal strings "none" / "nan".
    if e is None or (isinstance(e, float) and e != e):
        return ""
    return str(e).strip().lower()
36
+
37
def _column_set(column, normalizer):
    """Build a lookup set from one column of ap_df.

    Returns an empty set when the sheet failed to load or the column is
    absent, so a renamed header degrades matching instead of raising
    KeyError while the module is being imported.
    """
    if ap_df.empty:
        return set()
    if column not in ap_df.columns:
        print(f"Column missing in Excel: {column}")
        return set()
    return {normalizer(x) for x in ap_df[column].dropna()}


# Lookup sets are built once at startup and reused for every request.
SB_TAG_SET = _column_set("SB TAG", lambda x: str(x).strip().upper())
MOBILE_SET = _column_set("MOBILE NO.", normalize_phone)
EMAIL_SET = _column_set("E Mail id", normalize_email)
40
+
41
  BRANDS = [
42
  "Zerodha", "Upstox", "Groww", "Angel One", "Motilal Oswal", "Sharekhan", "5paisa", "ICICI Direct",
43
  "HDFC Securities", "Kotak Securities", "Axis Direct", "IIFL", "Paytm Money", "Edelweiss", "Geojit",
44
  "Fyers", "Alice Blue", "mStock", "Stockal", "Kuvera", "Smallcase", "Jupiter", "Fi", "INDmoney",
45
  "PhonePe", "Paytm", "Google Pay", "BHIM", "MobiKwik", "Cred", "Niyo", "Razorpay", "ETMoney",
46
+ "Bajaj Finserv", "SBI Securities", "YES Securities", "IDFC FIRST", "CAMS", "Karvy", "LIC", "ICICI Prudential","Motilal","India Infoline","Sherkhan"
47
  ]
48
 
49
  NEGATIVE_KEYWORDS = [
 
50
  "Assuring return", "Invest with us and earn", "Profit Share", "Password share",
51
  "Unauthorised trade", "without consent order", "Fake advisor", "Arrest took money",
52
  "False promise", "Raid", "Imposing Angel broking", "Impersonation angel one",
 
126
  return ""
127
 
128
def extract_phone_numbers(text):
    """Extract Indian mobile numbers from free text.

    Accepts an optional "+91" / "91" / "0" prefix and single spaces or
    hyphens between digits, as speech-to-text output often inserts them.

    Args:
        text: Text to scan. None or "" yields an empty list.

    Returns:
        A sorted list of unique ten-digit strings (digits only).
    """
    pattern = (
        r"(?<!\d)"                      # do not start inside a longer digit run
        r"(?:\+?91[\-\s]?|0)?"          # optional country code or trunk prefix
        r"([6-9]\d(?:[\-\s]?\d){8})"    # ten digits, first one 6-9
        r"(?!\d)"                       # do not stop inside a longer digit run
    )
    matches = re.findall(pattern, text or "")
    # Strip separators so every number compares as plain digits.
    cleaned = {re.sub(r"\D", "", m) for m in matches}
    return sorted(cleaned)
133
 
134
def extract_emails(text):
    """Extract e-mail addresses from free text.

    Args:
        text: Text to scan. None or "" yields an empty list.

    Returns:
        A sorted list of unique addresses, trimmed and lower-cased.
    """
    found = re.findall(r'[\w\.-]+@[\w\.-]+\.\w+', text or "")
    # Normalise first, then de-duplicate, so "A@x.com" and "a@x.com"
    # collapse into a single entry.
    return sorted({addr.strip().lower() for addr in found})
137
 
138
  def extract_broker_codes(text):
139
  codes = re.findall(r'\b([A-Z]{2,4}\d{4,6})\b', text.upper())
 
176
  try:
177
  if not audio_path or not isinstance(audio_path, str):
178
  return ("No audio file provided.", "", "", "", "", "", "", "", "", "", "", "", "")
 
179
  with open(audio_path, "rb") as audio_file:
180
  transcript = openai.audio.transcriptions.create(
181
  model="whisper-1",
 
208
  except Exception as e:
209
  transcript_en = f"Error translating: {e}"
210
 
 
211
  summary = summarize_long_text(transcript_en)
212
  brands = extract_brands(transcript_en)
213
  topics = extract_topics(transcript_en)
214
  key_takeaways = make_bullets(summary)
215
  negatives = extract_negative_keywords(transcript_en)
216
 
217
+ # Extraction & normalization
218
  phones = extract_phone_numbers(transcript_en)
219
  emails = extract_emails(transcript_en)
220
  codes = extract_broker_codes(transcript_en)
221
 
222
+ # Phone number matching
223
+ matched_phones = sorted([p for p in phones if p in MOBILE_SET])
224
+ unmatched_phones = sorted([p for p in phones if p not in MOBILE_SET])
225
+ if not matched_phones and not unmatched_phones and phones:
226
+ unmatched_phones = phones # Show what was extracted
227
+
228
+ # Email matching
229
+ matched_emails = sorted([e for e in emails if e in EMAIL_SET])
230
+ unmatched_emails = sorted([e for e in emails if e not in EMAIL_SET])
231
+ if not matched_emails and not unmatched_emails and emails:
232
+ unmatched_emails = emails
233
+
234
+ # Broker code matching
235
+ matched_codes = sorted([c for c in codes if c in SB_TAG_SET])
236
+ unmatched_codes = sorted([c for c in codes if c not in SB_TAG_SET])
237
+ if not matched_codes and not unmatched_codes and codes:
238
+ unmatched_codes = codes
239
 
240
  return (
241
  lang_text,
 
271
  gr.Textbox(label="Unmatched Email IDs"),
272
  ],
273
  title="Audio Brand, Scam Keyword & AP/Contact Crossmatch (Indian Brokerage)",
274
+ description="Upload audio (MP3/WAV). Extract transcript, summary, Indian brokerage brands, scam keywords, and crossmatch broker codes, phone numbers, emails with the official AP Excel list."
275
  )
276
 
277
  iface.launch()