Spaces:

yourpartner
/

demospace

Running

App Files Community

mitesh001 committed about 1 month ago

Commit

4466935

1 Parent(s): d635c79

Increase accuracy for expense and date parsing

Browse files

Files changed (1) hide show

main.py +50 -18

main.py CHANGED Viewed

@@ -178,18 +178,43 @@ def detect_store_category(text: str):
     return found_stores
 # Function to extract dates and time mentions using dateparser's search_dates, with regex-based filtering of false positives
-def extract_dates_with_accuracy(text: str, amounts: list):
-    # Get list of numeric values from amount extraction to exclude
     amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
-    # Use dateparser with relaxed rules
-    import dateparser
-    from dateparser.search import search_dates
-    results = search_dates(text, settings = {
-        "PREFER_DATES_FROM": "future",      # Bias future
-        "RELATIVE_BASE": datetime.now(),    # Anchor to now
-        "RETURN_AS_TIMEZONE_AWARE": False,  # Use naive datetime
     })
     time_mentions = []
@@ -199,16 +224,11 @@ def extract_dates_with_accuracy(text: str, amounts: list):
         for phrase, date in results:
             clean_phrase = phrase.strip().lower()
-            # Filter out false positives like '1200'
             if clean_phrase in amount_values:
                 continue
-            # Ignore common noise phrases that are not actual dates
             if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
                 continue
-            # Optionally: skip pure numbers or short numerics
-            if re.fullmatch(r"\d{3,4}", clean_phrase):
                 continue
             time_mentions.append(clean_phrase)
             parsed_dates.append(date.isoformat())
@@ -241,7 +261,7 @@ def estimate_mood(text):
     mood_map = {
         "happy": [
             "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
-            "maza aa gaya", "acha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar"
         ],
         "sad": [
             "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
@@ -463,6 +483,9 @@ def get_meta_info(text: str):
         "year": now.year               # calendar year, e.g. 2025
     }
 # Function to extract amounts in various currencies from text
 def extract_amounts(text: str):
     currency_patterns = [
@@ -540,6 +563,15 @@ def extract_amounts(text: str):
                     continue
                 try:
                     number = float(number_str)
                     key = (number, "INR")
                     if key not in seen:
                         seen.add(key)
@@ -605,7 +637,7 @@ def predict_expense_category(text, detected_stores):
         "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
     ],
     "digital_services": [
-        "domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
     ],
     "gifts_donations": [
         "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
@@ -883,7 +915,7 @@ async def analyze(input: TextInput):
     # classification = classifier(text, labels)
     # Async call to classifier
-    classification = await asyncio.to_thread(classifier, text, labels)
     best_label = classification['labels'][0]
     best_label = label_map.get(best_label, best_label)

     return found_stores
 # Function to extract dates and time mentions using dateparser's search_dates, with regex-based filtering of false positives
+def extract_dates_with_accuracy(text: str, amounts: list = None):
+    amounts = amounts or []
     amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
+    original_text = text
+    text_lower = text.lower()
+    # Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
+    hinglish_map = {
+        "aaj": "today",
+        "kal": "tomorrow",   # Assuming future
+        "parso": "day after tomorrow",
+        "abhi": "now",
+        "subah": "morning",
+        "shaam": "evening",
+        "raat ko": "night",
+        "agli baar": "next time",
+        "agli hafte": "next week",
+        "agli mahine": "next month",
+        "iss hafte": "this week",
+        "iss mahine": "this month",
+        "pichhle hafte": "last week",
+        "tareekh": "date",
+        "do din baad": "in 2 days",
+        "teen din baad": "in 3 days",
+    }
+    replaced_text = text_lower
+    for h_word, en_word in hinglish_map.items():
+        replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
+    # Step 2: Parse using dateparser
+    results = search_dates(replaced_text, settings={
+        "PREFER_DATES_FROM": "future",
+        "RELATIVE_BASE": datetime.now(),
+        "RETURN_AS_TIMEZONE_AWARE": False,
+        "STRICT_PARSING": True,
     })
     time_mentions = []
         for phrase, date in results:
             clean_phrase = phrase.strip().lower()
             if clean_phrase in amount_values:
                 continue
             if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
                 continue
+            if re.fullmatch(r"\d{3,4}", clean_phrase):  # skip 2025, 1200
                 continue
             time_mentions.append(clean_phrase)
             parsed_dates.append(date.isoformat())
     mood_map = {
         "happy": [
             "happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
+            "maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
         ],
         "sad": [
             "sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
         "year": now.year               # calendar year, e.g. 2025
     }
+def is_year_context(text_snippet):
+    return bool(re.search(r"\b(?:jan|feb|march|april|may|june|july|aug|sept|oct|nov|dec|year|in|on|by|for)\b", text_snippet))
 # Function to extract amounts in various currencies from text
 def extract_amounts(text: str):
     currency_patterns = [
                     continue
                 try:
                     number = float(number_str)
+                    # Context check for year-like numbers
+                    if 2020 <= number <= 2100:
+                        # Check a window of 30 characters before/after the number for a year clue
+                        span = match.span(1)
+                        surrounding = text_lower[max(0, span[0]-30):span[1]+30]
+                        if is_year_context(surrounding):
+                            continue  # Looks like a year
                     key = (number, "INR")
                     if key not in seen:
                         seen.add(key)
         "course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
     ],
     "digital_services": [
+        "domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
     ],
     "gifts_donations": [
         "gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
     # classification = classifier(text, labels)
     # Async call to classifier
+    classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
     best_label = classification['labels'][0]
     best_label = label_map.get(best_label, best_label)