mitesh001 committed on
Commit
4466935
·
1 Parent(s): d635c79

Increase accuracy for expense and date parsing

Browse files
Files changed (1) hide show
  1. main.py +50 -18
main.py CHANGED
@@ -178,18 +178,43 @@ def detect_store_category(text: str):
178
  return found_stores
179
 
180
  # Function to extract dates and time mentions using dateparser's search_dates
181
- def extract_dates_with_accuracy(text: str, amounts: list):
182
- # Get list of numeric values from amount extraction to exclude
183
  amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
184
 
185
- # Use dateparser with relaxed rules
186
- import dateparser
187
- from dateparser.search import search_dates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- results = search_dates(text, settings = {
190
- "PREFER_DATES_FROM": "future", # Bias future
191
- "RELATIVE_BASE": datetime.now(), # Anchor to now
192
- "RETURN_AS_TIMEZONE_AWARE": False, # Use naive datetime
 
 
 
 
 
 
193
  })
194
 
195
  time_mentions = []
@@ -199,16 +224,11 @@ def extract_dates_with_accuracy(text: str, amounts: list):
199
  for phrase, date in results:
200
  clean_phrase = phrase.strip().lower()
201
 
202
- # Filter out false positives like '1200'
203
  if clean_phrase in amount_values:
204
  continue
205
-
206
- # Ignore common noise phrases that are not actual dates
207
  if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
208
  continue
209
-
210
- # Optionally: skip pure numbers or short numerics
211
- if re.fullmatch(r"\d{3,4}", clean_phrase):
212
  continue
213
  time_mentions.append(clean_phrase)
214
  parsed_dates.append(date.isoformat())
@@ -241,7 +261,7 @@ def estimate_mood(text):
241
  mood_map = {
242
  "happy": [
243
  "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
244
- "maza aa gaya", "acha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar"
245
  ],
246
  "sad": [
247
  "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
@@ -463,6 +483,9 @@ def get_meta_info(text: str):
463
  "year": now.year # calendar year, e.g. 2025
464
  }
465
 
 
 
 
466
  # Function to extract amounts in various currencies from text
467
  def extract_amounts(text: str):
468
  currency_patterns = [
@@ -540,6 +563,15 @@ def extract_amounts(text: str):
540
  continue
541
  try:
542
  number = float(number_str)
 
 
 
 
 
 
 
 
 
543
  key = (number, "INR")
544
  if key not in seen:
545
  seen.add(key)
@@ -605,7 +637,7 @@ def predict_expense_category(text, detected_stores):
605
  "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
606
  ],
607
  "digital_services": [
608
- "domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
609
  ],
610
  "gifts_donations": [
611
  "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
@@ -883,7 +915,7 @@ async def analyze(input: TextInput):
883
 
884
  # classification = classifier(text, labels)
885
  # Async call to classifier
886
- classification = await asyncio.to_thread(classifier, text, labels)
887
  best_label = classification['labels'][0]
888
 
889
  best_label = label_map.get(best_label, best_label)
 
178
  return found_stores
179
 
180
  # Function to extract dates and time mentions using dateparser's search_dates
181
+ def extract_dates_with_accuracy(text: str, amounts: list = None):
182
+ amounts = amounts or []
183
  amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
184
 
185
+ original_text = text
186
+ text_lower = text.lower()
187
+
188
+ # Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
189
+ hinglish_map = {
190
+ "aaj": "today",
191
+ "kal": "tomorrow", # Assuming future
192
+ "parso": "day after tomorrow",
193
+ "abhi": "now",
194
+ "subah": "morning",
195
+ "shaam": "evening",
196
+ "raat ko": "night",
197
+ "agli baar": "next time",
198
+ "agli hafte": "next week",
199
+ "agli mahine": "next month",
200
+ "iss hafte": "this week",
201
+ "iss mahine": "this month",
202
+ "pichhle hafte": "last week",
203
+ "tareekh": "date",
204
+ "do din baad": "in 2 days",
205
+ "teen din baad": "in 3 days",
206
+ }
207
 
208
+ replaced_text = text_lower
209
+ for h_word, en_word in hinglish_map.items():
210
+ replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
211
+
212
+ # Step 2: Parse using dateparser
213
+ results = search_dates(replaced_text, settings={
214
+ "PREFER_DATES_FROM": "future",
215
+ "RELATIVE_BASE": datetime.now(),
216
+ "RETURN_AS_TIMEZONE_AWARE": False,
217
+ "STRICT_PARSING": True,
218
  })
219
 
220
  time_mentions = []
 
224
  for phrase, date in results:
225
  clean_phrase = phrase.strip().lower()
226
 
 
227
  if clean_phrase in amount_values:
228
  continue
 
 
229
  if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
230
  continue
231
+ if re.fullmatch(r"\d{3,4}", clean_phrase): # skip 2025, 1200
 
 
232
  continue
233
  time_mentions.append(clean_phrase)
234
  parsed_dates.append(date.isoformat())
 
261
  mood_map = {
262
  "happy": [
263
  "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
264
+ "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
265
  ],
266
  "sad": [
267
  "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
 
483
  "year": now.year # calendar year, e.g. 2025
484
  }
485
 
486
# Words that suggest a nearby 4-digit number is a calendar year rather than an
# amount. Month names are accepted in full and abbreviated form; the original
# alternatives (including "march", "april", "june", "july", "sept") all still
# match.
# NOTE(review): the prepositions "in", "on", "by", "for" are kept for backward
# compatibility, but they are very common words and will also match phrases
# such as "2500 for groceries" -- confirm this is the intended trade-off.
_YEAR_CONTEXT_RE = re.compile(
    r"\b(?:"
    r"jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|"
    r"aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|"
    r"year|in|on|by|for"
    r")\b",
    re.IGNORECASE,
)


def is_year_context(text_snippet):
    """Return True if *text_snippet* contains a word hinting at a date context.

    Used to decide whether a year-like number (e.g. 2025) found near this
    snippet is more likely a calendar year than a monetary amount.

    Args:
        text_snippet: Text surrounding the number. Matching is
            case-insensitive, so the caller does not need to lowercase it.

    Returns:
        bool: True when a month name (full or abbreviated), the word "year",
        or one of the prepositions "in", "on", "by", "for" appears as a whole
        word; False otherwise, including for empty or None input.
    """
    if not text_snippet:
        return False
    return bool(_YEAR_CONTEXT_RE.search(text_snippet))
488
+
489
  # Function to extract amounts in various currencies from text
490
  def extract_amounts(text: str):
491
  currency_patterns = [
 
563
  continue
564
  try:
565
  number = float(number_str)
566
+
567
+ # Context check for year-like numbers
568
+ if 2020 <= number <= 2100:
569
+ # Check 30 characters before/after the number for a year clue
570
+ span = match.span(1)
571
+ surrounding = text_lower[max(0, span[0]-30):span[1]+30]
572
+ if is_year_context(surrounding):
573
+ continue # Looks like a year
574
+
575
  key = (number, "INR")
576
  if key not in seen:
577
  seen.add(key)
 
637
  "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
638
  ],
639
  "digital_services": [
640
+ "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
641
  ],
642
  "gifts_donations": [
643
  "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
 
915
 
916
  # classification = classifier(text, labels)
917
  # Async call to classifier
918
+ classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
919
  best_label = classification['labels'][0]
920
 
921
  best_label = label_map.get(best_label, best_label)