Spaces:
Running
Running
Increase accuracy for expense and date parsing
Browse files
main.py
CHANGED
@@ -178,18 +178,43 @@ def detect_store_category(text: str):
|
|
178 |
return found_stores
|
179 |
|
180 |
# Function to extract dates and time mentions based on regex patterns
|
181 |
-
def extract_dates_with_accuracy(text: str, amounts: list):
|
182 |
-
|
183 |
amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
|
184 |
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
193 |
})
|
194 |
|
195 |
time_mentions = []
|
@@ -199,16 +224,11 @@ def extract_dates_with_accuracy(text: str, amounts: list):
|
|
199 |
for phrase, date in results:
|
200 |
clean_phrase = phrase.strip().lower()
|
201 |
|
202 |
-
# Filter out false positives like '1200'
|
203 |
if clean_phrase in amount_values:
|
204 |
continue
|
205 |
-
|
206 |
-
# Ignore common noise phrases that are not actual dates
|
207 |
if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
|
208 |
continue
|
209 |
-
|
210 |
-
# Optionally: skip pure numbers or short numerics
|
211 |
-
if re.fullmatch(r"\d{3,4}", clean_phrase):
|
212 |
continue
|
213 |
time_mentions.append(clean_phrase)
|
214 |
parsed_dates.append(date.isoformat())
|
@@ -241,7 +261,7 @@ def estimate_mood(text):
|
|
241 |
mood_map = {
|
242 |
"happy": [
|
243 |
"happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
|
244 |
-
"maza aa gaya", "
|
245 |
],
|
246 |
"sad": [
|
247 |
"sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
|
@@ -463,6 +483,9 @@ def get_meta_info(text: str):
|
|
463 |
"year": now.year # calendar year
|
464 |
}
|
465 |
|
|
|
|
|
|
|
466 |
# Function to extract amounts in various currencies from text
|
467 |
def extract_amounts(text: str):
|
468 |
currency_patterns = [
|
@@ -540,6 +563,15 @@ def extract_amounts(text: str):
|
|
540 |
continue
|
541 |
try:
|
542 |
number = float(number_str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
543 |
key = (number, "INR")
|
544 |
if key not in seen:
|
545 |
seen.add(key)
|
@@ -605,7 +637,7 @@ def predict_expense_category(text, detected_stores):
|
|
605 |
"course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
|
606 |
],
|
607 |
"digital_services": [
|
608 |
-
"domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
|
609 |
],
|
610 |
"gifts_donations": [
|
611 |
"gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
|
@@ -883,7 +915,7 @@ async def analyze(input: TextInput):
|
|
883 |
|
884 |
# classification = classifier(text, labels)
|
885 |
# Async call to classifier
|
886 |
-
classification = await asyncio.to_thread(classifier, text, labels)
|
887 |
best_label = classification['labels'][0]
|
888 |
|
889 |
best_label = label_map.get(best_label, best_label)
|
|
|
178 |
return found_stores
|
179 |
|
180 |
# Function to extract dates and time mentions: maps Hinglish phrases to English, then parses with search_dates
|
181 |
+
def extract_dates_with_accuracy(text: str, amounts: list = None):
|
182 |
+
amounts = amounts or []
|
183 |
amount_values = {str(int(a["value"])) for a in amounts if isinstance(a["value"], (int, float))}
|
184 |
|
185 |
+
original_text = text
|
186 |
+
text_lower = text.lower()
|
187 |
+
|
188 |
+
# Step 1: Replace Hinglish phrases with English equivalents (only for parsing)
|
189 |
+
hinglish_map = {
|
190 |
+
"aaj": "today",
|
191 |
+
"kal": "tomorrow", # Assuming future
|
192 |
+
"parso": "day after tomorrow",
|
193 |
+
"abhi": "now",
|
194 |
+
"subah": "morning",
|
195 |
+
"shaam": "evening",
|
196 |
+
"raat ko": "night",
|
197 |
+
"agli baar": "next time",
|
198 |
+
"agli hafte": "next week",
|
199 |
+
"agli mahine": "next month",
|
200 |
+
"iss hafte": "this week",
|
201 |
+
"iss mahine": "this month",
|
202 |
+
"pichhle hafte": "last week",
|
203 |
+
"tareekh": "date",
|
204 |
+
"do din baad": "in 2 days",
|
205 |
+
"teen din baad": "in 3 days",
|
206 |
+
}
|
207 |
|
208 |
+
replaced_text = text_lower
|
209 |
+
for h_word, en_word in hinglish_map.items():
|
210 |
+
replaced_text = re.sub(rf"\b{re.escape(h_word)}\b", en_word, replaced_text)
|
211 |
+
|
212 |
+
# Step 2: Parse using dateparser
|
213 |
+
results = search_dates(replaced_text, settings={
|
214 |
+
"PREFER_DATES_FROM": "future",
|
215 |
+
"RELATIVE_BASE": datetime.now(),
|
216 |
+
"RETURN_AS_TIMEZONE_AWARE": False,
|
217 |
+
"STRICT_PARSING": True,
|
218 |
})
|
219 |
|
220 |
time_mentions = []
|
|
|
224 |
for phrase, date in results:
|
225 |
clean_phrase = phrase.strip().lower()
|
226 |
|
|
|
227 |
if clean_phrase in amount_values:
|
228 |
continue
|
|
|
|
|
229 |
if clean_phrase in {"on", "at", "in", "by", "to", "of"}:
|
230 |
continue
|
231 |
+
if re.fullmatch(r"\d{3,4}", clean_phrase): # skip 2025, 1200
|
|
|
|
|
232 |
continue
|
233 |
time_mentions.append(clean_phrase)
|
234 |
parsed_dates.append(date.isoformat())
|
|
|
261 |
mood_map = {
|
262 |
"happy": [
|
263 |
"happy", "excited", "good", "joy", "grateful", "glad", "pleased", "content", "satisfied", "cheerful", "elated",
|
264 |
+
"maza aa gaya", "achha lag raha hai", "khush", "khushi", "badiya", "mast", "enjoy", "enjoyed", "mazedaar", "achha"
|
265 |
],
|
266 |
"sad": [
|
267 |
"sad", "upset", "crying", "lonely", "depressed", "down", "disappointed", "heartbroken", "unhappy",
|
|
|
483 |
"year": now.year # calendar year
|
484 |
}
|
485 |
|
486 |
+
def is_year_context(text_snippet):
    """Report whether a text snippet contains a word suggesting a year.

    The snippet is searched, case-insensitively, for any of the following as
    a whole word: a month name (abbreviated or full, e.g. "jan"/"january",
    "sep"/"sept"/"september"), the word "year", or one of the prepositions
    "in", "on", "by", "for".

    Args:
        text_snippet: Text surrounding a candidate number.

    Returns:
        True if at least one clue word is present, otherwise False.
    """
    # NOTE(review): the prepositions are very common words, which makes this
    # check permissive; they are kept so existing matches stay unchanged.
    year_clue = (
        r"\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?"
        r"|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?"
        r"|dec(?:ember)?|year|in|on|by|for)\b"
    )
    return bool(re.search(year_clue, text_snippet, re.IGNORECASE))
|
488 |
+
|
489 |
# Function to extract amounts in various currencies from text
|
490 |
def extract_amounts(text: str):
|
491 |
currency_patterns = [
|
|
|
563 |
continue
|
564 |
try:
|
565 |
number = float(number_str)
|
566 |
+
|
567 |
+
# Context check for year-like numbers
|
568 |
+
if 2020 <= number <= 2100:
|
569 |
+
# Check a 30-character window on either side of the number for a year clue
|
570 |
+
span = match.span(1)
|
571 |
+
surrounding = text_lower[max(0, span[0]-30):span[1]+30]
|
572 |
+
if is_year_context(surrounding):
|
573 |
+
continue # Looks like a year
|
574 |
+
|
575 |
key = (number, "INR")
|
576 |
if key not in seen:
|
577 |
seen.add(key)
|
|
|
637 |
"course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill", "padhai", "school", "college", "tuition", "kitab", "book", "fees", "shiksha"
|
638 |
],
|
639 |
"digital_services": [
|
640 |
+
"domain", "membership", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas", "subscription", "digital", "online", "app", "service", "renewal"
|
641 |
],
|
642 |
"gifts_donations": [
|
643 |
"gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift", "uphaar", "daan", "tohfa", "chanda", "puja", "mandir", "gurudwara"
|
|
|
915 |
|
916 |
# classification = classifier(text, labels)
|
917 |
# Async call to classifier
|
918 |
+
classification = await asyncio.to_thread(classifier, text, labels, hypothesis_template="This entry is about {}.")
|
919 |
best_label = classification['labels'][0]
|
920 |
|
921 |
best_label = label_map.get(best_label, best_label)
|