Mitesh Koshiya committed on
Commit
e6976d6
·
1 Parent(s): 65c85da

Update expense specific things

Browse files
Files changed (1) hide show
  1. main.py +129 -42
main.py CHANGED
@@ -24,6 +24,7 @@ import psycopg2
24
  from psycopg2.extras import Json
25
  import os
26
  from dotenv import load_dotenv
 
27
 
28
  # Load environment variables
29
  load_dotenv()
@@ -42,6 +43,9 @@ app.add_middleware(
42
  CREATE_TABLE_QUERY = """
43
  CREATE TABLE IF NOT EXISTS user_entries (
44
  uuid UUID PRIMARY KEY,
 
 
 
45
  raw_text TEXT,
46
  word_count INT,
47
  day_of_week TEXT,
@@ -49,6 +53,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
49
  month TEXT,
50
  year INT,
51
  type TEXT,
 
52
  intent TEXT,
53
  confidence_scores JSONB,
54
  urgency_score INT,
@@ -66,6 +71,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
66
  amounts JSONB,
67
  stores TEXT[],
68
  processing_time_ms INT,
 
69
  created_at TIMESTAMPTZ DEFAULT now()
70
  );
71
  """
@@ -419,22 +425,26 @@ def get_meta_info(text: str):
419
  # Function to extract amounts in various currencies from text
420
  def extract_amounts(text: str):
421
  currency_patterns = [
422
- # ₹5000, Rs. 1200, INR 300
423
- (r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)", "INR"),
424
- # $250.75 or 250.75$
425
- (r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)", "USD"),
426
- (r"(\d[\d,]*(?:\.\d+)?)\s?\$", "USD"),
427
- # €100 or 100€
428
- (r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)", "EUR"),
429
- (r"(\d[\d,]*(?:\.\d+)?)\s?€", "EUR"),
430
- # Word-based currency
431
- (r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)", "INR"),
432
- (r"(\d+(?:\.\d+)?)\s?(dollars?)", "USD"),
433
- (r"(\d+(?:\.\d+)?)\s?(euros?)", "EUR"),
434
- (r"(\d+(?:\.\d+)?)\s?(cents?)", "USD"),
435
- # Indian system
436
- (r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)", "INR"),
437
- (r"(\d+(?:\.\d+)?)\s?(crores?|cr)", "INR"),
 
 
 
 
438
  ]
439
 
440
  results = []
@@ -442,19 +452,25 @@ def extract_amounts(text: str):
442
  text_lower = text.lower()
443
 
444
  for pattern, currency_code in currency_patterns:
445
- for match in re.finditer(pattern, text_lower):
446
  groups = match.groups()
447
  raw_number = next((g for g in groups if re.match(r"\d", g)), None)
448
  if not raw_number:
449
  continue
 
 
 
450
  try:
451
  number = float(raw_number.replace(",", ""))
452
- if any(word in groups for word in ['lakh', 'lacs', 'lakhs']):
 
 
453
  number *= 100_000
454
- elif any(word in groups for word in ['crore', 'crores', 'cr']):
455
  number *= 10_000_000
456
- elif 'cents' in groups:
457
  number /= 100
 
458
  except Exception:
459
  continue
460
 
@@ -466,24 +482,78 @@ def extract_amounts(text: str):
466
  "currency": currency_code
467
  })
468
 
469
- # Fallback: detect simple numeric amounts like "paid 500 for rent"
470
  if not results:
471
- match = re.search(r"\b(?:paid|spent|buy|purchase|cost|price)\b.*?(\d{2,8})", text_lower)
472
- if match:
473
- try:
474
- number = float(match.group(1).replace(",", ""))
475
- key = (number, "INR")
476
- if key not in seen:
477
- results.append({
478
- "value": round(number, 2),
479
- "currency": "INR"
480
- })
481
- except:
482
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
483
 
484
  return results
485
 
486
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
  def insert_text_entry(data):
488
  try:
489
  conn = psycopg2.connect(DATABASE_URL)
@@ -491,17 +561,17 @@ def insert_text_entry(data):
491
 
492
  insert_query = """
493
  INSERT INTO user_entries (
494
- uuid, raw_text, word_count, day_of_week, hour_of_day, month, year,
495
- type, intent, confidence_scores, urgency_score,
496
  time_mentions, parsed_dates, tense, summary,
497
  people, mood, language, sentiment_score, tags,
498
- action_required, entities, amounts, stores, processing_time_ms
499
  ) VALUES (
500
- %(uuid)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
501
- %(type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
502
  %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
503
  %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
504
- %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s
505
  )
506
  ON CONFLICT (uuid) DO NOTHING;
507
  """
@@ -511,7 +581,8 @@ def insert_text_entry(data):
511
  "confidence_scores": Json(data["confidence_scores"]),
512
  "language": Json(data["language"]),
513
  "entities": Json(data["entities"]),
514
- "amounts": Json(data["amounts"])
 
515
  })
516
 
517
  conn.commit()
@@ -576,10 +647,12 @@ async def analyze(input: TextInput):
576
 
577
  scores = dict(zip(classification['labels'], classification['scores']))
578
  # # Convert to short labels
579
- confidence_scores = {
580
  label_map.get(label, label): score
581
  for label, score in scores.items()
582
  }
 
 
583
 
584
  amounts = await asyncio.to_thread(extract_amounts, text)
585
  parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
@@ -594,6 +667,9 @@ async def analyze(input: TextInput):
594
  intent = infer_intent(best_label, text)
595
  urgency_score = get_urgency_score(text, parsed_dates)
596
  detected_stores = detect_store_category(text)
 
 
 
597
 
598
  # Define action triggers
599
  ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
@@ -608,6 +684,7 @@ async def analyze(input: TextInput):
608
 
609
  result = {
610
  "uuid": str(uuid.uuid4()), # Unique identifier for the request
 
611
  "raw_text": text,
612
  "word_count": meta["word_count"],
613
  "day_of_week": meta["day_of_week"],
@@ -615,6 +692,7 @@ async def analyze(input: TextInput):
615
  "month": meta["month"],
616
  "year": meta["year"],
617
  "type": best_label,
 
618
  "intent": intent,
619
  "confidence_scores": confidence_scores,
620
  "urgency_score": urgency_score,
@@ -633,12 +711,21 @@ async def analyze(input: TextInput):
633
  "stores": detected_stores,
634
  "processing_time_ms": processing_time_ms
635
  }
 
 
 
 
 
 
636
 
637
  # Insert into database
638
  await asyncio.to_thread(insert_text_entry, result)
639
 
640
  # Log the result
641
- print("✅ Analysis complete:", result)
 
 
 
642
 
643
  # Return the result as JSON response
644
  return ORJSONResponse(content=result)
 
24
  from psycopg2.extras import Json
25
  import os
26
  from dotenv import load_dotenv
27
+ import random
28
 
29
  # Load environment variables
30
  load_dotenv()
 
43
  CREATE_TABLE_QUERY = """
44
  CREATE TABLE IF NOT EXISTS user_entries (
45
  uuid UUID PRIMARY KEY,
46
+ user_id TEXT,
47
+ user_name TEXT,
48
+ user_email TEXT,
49
  raw_text TEXT,
50
  word_count INT,
51
  day_of_week TEXT,
 
53
  month TEXT,
54
  year INT,
55
  type TEXT,
56
+ expense_type TEXT,
57
  intent TEXT,
58
  confidence_scores JSONB,
59
  urgency_score INT,
 
71
  amounts JSONB,
72
  stores TEXT[],
73
  processing_time_ms INT,
74
+ raw_json JSONB,
75
  created_at TIMESTAMPTZ DEFAULT now()
76
  );
77
  """
 
425
  # Function to extract amounts in various currencies from text
426
  def extract_amounts(text: str):
427
  currency_patterns = [
428
+ # INR variants
429
+ (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
430
+ (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
431
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
432
+ # USD variants
433
+ (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
434
+ (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
435
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
436
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
437
+ # EUR variants
438
+ (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
439
+ (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
440
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
441
+ # GBP variants
442
+ (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
443
+ (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
444
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
445
+ # INR large units
446
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
447
+ (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
448
  ]
449
 
450
  results = []
 
452
  text_lower = text.lower()
453
 
454
  for pattern, currency_code in currency_patterns:
455
+ for match in pattern.finditer(text_lower):
456
  groups = match.groups()
457
  raw_number = next((g for g in groups if re.match(r"\d", g)), None)
458
  if not raw_number:
459
  continue
460
+ # Ignore phone numbers and IDs (10+ digits)
461
+ if len(raw_number.replace(",", "")) >= 10:
462
+ continue
463
  try:
464
  number = float(raw_number.replace(",", ""))
465
+
466
+ # Check for lakh/crore/cents multipliers
467
+ if any(g in ['lakh', 'lacs', 'lakhs'] for g in groups):
468
  number *= 100_000
469
+ elif any(g in ['crore', 'crores', 'cr', 'crs'] for g in groups):
470
  number *= 10_000_000
471
+ elif any(g == 'cents' for g in groups):
472
  number /= 100
473
+
474
  except Exception:
475
  continue
476
 
 
482
  "currency": currency_code
483
  })
484
 
485
+ # Fallback matching for generic numeric phrases near expense keywords
486
  if not results:
487
+ fallback_patterns = [
488
+ re.compile(
489
+ r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
490
+ ),
491
+ re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
492
+ ]
493
+ for fallback_pattern in fallback_patterns:
494
+ match = fallback_pattern.search(text_lower)
495
+ if match:
496
+ number_str = match.group(1).replace(",", "")
497
+ # Ignore phone numbers and IDs
498
+ if len(number_str) >= 10:
499
+ continue
500
+ try:
501
+ number = float(number_str)
502
+ key = (number, "INR")
503
+ if key not in seen:
504
+ seen.add(key)
505
+ results.append({
506
+ "value": round(number, 2),
507
+ "currency": "INR"
508
+ })
509
+ break # Only extract first match in fallback
510
+ except:
511
+ continue
512
 
513
  return results
514
 
515
 
516
+ def predict_expense_category(text, detected_stores):
517
+ text_lower = text.lower()
518
+
519
+ # 1. Use detected store category if available
520
+ if detected_stores:
521
+ best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
522
+ return best_match["category"]
523
+
524
+ # Category keyword mapping
525
+ category_keywords = {
526
+ "food": ["food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald"],
527
+ "transport": ["uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto"],
528
+ "shopping": ["amazon", "flipkart", "myntra", "shopping", "clothes", "apparel", "shoes", "jeans", "tshirt", "store", "fashion"],
529
+ "housing": ["rent", "apartment", "house", "flat", "maintenance", "landlord"],
530
+ "utilities": ["electricity", "power", "water", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio"],
531
+ "entertainment": ["movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium"],
532
+ "health": ["medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup"],
533
+ "travel": ["trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra"],
534
+ "education": ["course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill"],
535
+ "digital_services": ["domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas"],
536
+ "gifts_donations": ["gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift"],
537
+ "finance": ["insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto"],
538
+ "family_kids": ["kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche"],
539
+ "stationery": [
540
+ "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery",
541
+ "register", "files", "file", "markers", "highlighter", "sticky notes", "geometry box",
542
+ "stapler", "ink", "printer paper", "stationary shop", "stationary"
543
+ ]
544
+ }
545
+
546
+ # 2. Match using keyword scores
547
+ matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
548
+ best_match = max(matched.items(), key=lambda x: x[1])
549
+
550
+ if best_match[1] > 0:
551
+ return best_match[0]
552
+
553
+ return "miscellaneous"
554
+
555
+
556
+
557
  def insert_text_entry(data):
558
  try:
559
  conn = psycopg2.connect(DATABASE_URL)
 
561
 
562
  insert_query = """
563
  INSERT INTO user_entries (
564
+ uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
565
+ type, expense_type, intent, confidence_scores, urgency_score,
566
  time_mentions, parsed_dates, tense, summary,
567
  people, mood, language, sentiment_score, tags,
568
+ action_required, entities, amounts, stores, processing_time_ms, raw_json
569
  ) VALUES (
570
+ %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
571
+ %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
572
  %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
573
  %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
574
+ %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
575
  )
576
  ON CONFLICT (uuid) DO NOTHING;
577
  """
 
581
  "confidence_scores": Json(data["confidence_scores"]),
582
  "language": Json(data["language"]),
583
  "entities": Json(data["entities"]),
584
+ "amounts": Json(data["amounts"]),
585
+ "raw_json": Json(data["raw_json"])
586
  })
587
 
588
  conn.commit()
 
647
 
648
  scores = dict(zip(classification['labels'], classification['scores']))
649
  # # Convert to short labels
650
+ confidence_scores_full = {
651
  label_map.get(label, label): score
652
  for label, score in scores.items()
653
  }
654
+ # Only keep top 2
655
+ confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
656
 
657
  amounts = await asyncio.to_thread(extract_amounts, text)
658
  parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
 
667
  intent = infer_intent(best_label, text)
668
  urgency_score = get_urgency_score(text, parsed_dates)
669
  detected_stores = detect_store_category(text)
670
+ expense_category = ""
671
+ if best_label == "expense":
672
+ expense_category = predict_expense_category(text, detected_stores)
673
 
674
  # Define action triggers
675
  ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
 
684
 
685
  result = {
686
  "uuid": str(uuid.uuid4()), # Unique identifier for the request
687
+ "user_id": random.randint(1, 10), # Unique identifier for the request
688
  "raw_text": text,
689
  "word_count": meta["word_count"],
690
  "day_of_week": meta["day_of_week"],
 
692
  "month": meta["month"],
693
  "year": meta["year"],
694
  "type": best_label,
695
+ "expense_type": expense_category,
696
  "intent": intent,
697
  "confidence_scores": confidence_scores,
698
  "urgency_score": urgency_score,
 
711
  "stores": detected_stores,
712
  "processing_time_ms": processing_time_ms
713
  }
714
+
715
+ # Store a copy of result without raw_json to avoid circular reference
716
+ raw_json_copy = result.copy()
717
+ # Remove raw_json if present (shouldn't be, but for safety)
718
+ raw_json_copy.pop("raw_json", None)
719
+ result["raw_json"] = raw_json_copy
720
 
721
  # Insert into database
722
  await asyncio.to_thread(insert_text_entry, result)
723
 
724
  # Log the result
725
+ print("✅ Analysis complete")
726
+
727
+ # Remove raw_json from response
728
+ result.pop("raw_json", None)
729
 
730
  # Return the result as JSON response
731
  return ORJSONResponse(content=result)