Commit e6976d6 · Mitesh Koshiya committed
1 parent: 65c85da

Update expense specific things
main.py CHANGED
@@ -24,6 +24,7 @@ import psycopg2
 from psycopg2.extras import Json
 import os
 from dotenv import load_dotenv
+import random
 
 # Load environment variables
 load_dotenv()
@@ -42,6 +43,9 @@ app.add_middleware(
 CREATE_TABLE_QUERY = """
 CREATE TABLE IF NOT EXISTS user_entries (
     uuid UUID PRIMARY KEY,
+    user_id TEXT,
+    user_name TEXT,
+    user_email TEXT,
     raw_text TEXT,
     word_count INT,
     day_of_week TEXT,
@@ -49,6 +53,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
     month TEXT,
     year INT,
     type TEXT,
+    expense_type TEXT,
     intent TEXT,
     confidence_scores JSONB,
     urgency_score INT,
@@ -66,6 +71,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
     amounts JSONB,
     stores TEXT[],
     processing_time_ms INT,
+    raw_json JSONB,
     created_at TIMESTAMPTZ DEFAULT now()
 );
 """
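One deployment caveat worth noting: `CREATE TABLE IF NOT EXISTS` is a no-op when `user_entries` already exists, so the columns added above will not appear on a database created by an earlier version of this Space. A one-time migration along these lines would be needed; this is a hedged sketch, not part of the commit, and `MIGRATE_COLUMNS_QUERY` is a name invented here:

    MIGRATE_COLUMNS_QUERY = """
    ALTER TABLE user_entries
        ADD COLUMN IF NOT EXISTS user_id TEXT,
        ADD COLUMN IF NOT EXISTS user_name TEXT,
        ADD COLUMN IF NOT EXISTS user_email TEXT,
        ADD COLUMN IF NOT EXISTS expense_type TEXT,
        ADD COLUMN IF NOT EXISTS raw_json JSONB;
    """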
@@ -419,22 +425,26 @@ def get_meta_info(text: str):
 # Function to extract amounts in various currencies from text
 def extract_amounts(text: str):
     currency_patterns = [
-        (r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)", "INR"),
-        … (the remaining raw-string pattern tuples are truncated in the rendered diff) …
+        # INR variants
+        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
+        # USD variants
+        (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
+        # EUR variants
+        (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
+        # GBP variants
+        (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
+        # INR large units
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
     ]
 
     results = []
@@ -442,19 +452,25 @@ def extract_amounts(text: str):
     text_lower = text.lower()
 
     for pattern, currency_code in currency_patterns:
-        for match in re.finditer(pattern, text_lower):
+        for match in pattern.finditer(text_lower):
             groups = match.groups()
             raw_number = next((g for g in groups if re.match(r"\d", g)), None)
             if not raw_number:
                 continue
+            # Ignore phone numbers and IDs (10+ digits)
+            if len(raw_number.replace(",", "")) >= 10:
+                continue
             try:
                 number = float(raw_number.replace(",", ""))
-                if … (truncated in the rendered diff)
+
+                # Check for lakh/crore/cents multipliers
+                if any(g in ['lakh', 'lacs', 'lakhs'] for g in groups):
                     number *= 100_000
-                elif any(… (truncated in the rendered diff)
+                elif any(g in ['crore', 'crores', 'cr', 'crs'] for g in groups):
                     number *= 10_000_000
-                elif 'cents' in groups:
+                elif any(g == 'cents' for g in groups):
                     number /= 100
+
             except Exception:
                 continue
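As a sanity check on the compiled-pattern flow in the hunk above, here is a minimal, self-contained sketch; only two of the patterns are included, and the helper name demo_extract is invented for the demo:

    import re

    # Two-pattern stand-in for the full currency_patterns list above.
    currency_patterns = [
        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
    ]

    def demo_extract(text: str):
        results = []
        text_lower = text.lower()
        for pattern, currency_code in currency_patterns:
            for match in pattern.finditer(text_lower):
                groups = match.groups()
                # First capture group that starts with a digit is the amount
                raw_number = next((g for g in groups if g and re.match(r"\d", g)), None)
                if raw_number is None:
                    continue
                number = float(raw_number.replace(",", ""))
                # Mirror the lakh multiplier from the committed logic
                if any(g in ['lakh', 'lacs', 'lakhs'] for g in groups):
                    number *= 100_000
                results.append({"value": round(number, 2), "currency": currency_code})
        return results

    print(demo_extract("paid rs. 1,250 for books"))  # [{'value': 1250.0, 'currency': 'INR'}]
    print(demo_extract("car loan of 2 lakhs"))       # [{'value': 200000.0, 'currency': 'INR'}]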
@@ -466,24 +482,78 @@ def extract_amounts(text: str):
                 "currency": currency_code
             })
 
-    # Fallback …
+    # Fallback matching for generic numeric phrases near expense keywords
     if not results:
-        … (old fallback block truncated in the rendered diff) …
+        fallback_patterns = [
+            re.compile(
+                r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
+            ),
+            re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
+        ]
+        for fallback_pattern in fallback_patterns:
+            match = fallback_pattern.search(text_lower)
+            if match:
+                number_str = match.group(1).replace(",", "")
+                # Ignore phone numbers and IDs
+                if len(number_str) >= 10:
+                    continue
+                try:
+                    number = float(number_str)
+                    key = (number, "INR")
+                    if key not in seen:
+                        seen.add(key)
+                        results.append({
+                            "value": round(number, 2),
+                            "currency": "INR"
+                        })
+                    break  # Only extract first match in fallback
+                except:
+                    continue
 
     return results
 
 
+def predict_expense_category(text, detected_stores):
+    text_lower = text.lower()
+
+    # 1. Use detected store category if available
+    if detected_stores:
+        best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
+        return best_match["category"]
+
+    # Category keyword mapping
+    category_keywords = {
+        "food": ["food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald"],
+        "transport": ["uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto"],
+        "shopping": ["amazon", "flipkart", "myntra", "shopping", "clothes", "apparel", "shoes", "jeans", "tshirt", "store", "fashion"],
+        "housing": ["rent", "apartment", "house", "flat", "maintenance", "landlord"],
+        "utilities": ["electricity", "power", "water", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio"],
+        "entertainment": ["movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium"],
+        "health": ["medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup"],
+        "travel": ["trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra"],
+        "education": ["course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill"],
+        "digital_services": ["domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas"],
+        "gifts_donations": ["gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift"],
+        "finance": ["insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto"],
+        "family_kids": ["kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche"],
+        "stationery": [
+            "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery",
+            "register", "files", "file", "markers", "highlighter", "sticky notes", "geometry box",
+            "stapler", "ink", "printer paper", "stationary shop", "stationary"
+        ]
+    }
+
+    # 2. Match using keyword scores
+    matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
+    best_match = max(matched.items(), key=lambda x: x[1])
+
+    if best_match[1] > 0:
+        return best_match[0]
+
+    return "miscellaneous"
+
+
 def insert_text_entry(data):
     try:
         conn = psycopg2.connect(DATABASE_URL)
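The keyword-scoring path of the new predict_expense_category is easy to exercise in isolation. A minimal sketch with a two-category subset of the mapping (the helper name demo_predict is invented here):

    # Two-category subset of the committed mapping, for illustration only.
    category_keywords = {
        "food": ["lunch", "dinner", "swiggy", "zomato", "pizza"],
        "transport": ["uber", "ola", "taxi", "metro", "flight"],
    }

    def demo_predict(text: str) -> str:
        text_lower = text.lower()
        # Count how many keywords of each category appear in the text
        matched = {cat: sum(1 for kw in kws if kw in text_lower)
                   for cat, kws in category_keywords.items()}
        best_match = max(matched.items(), key=lambda x: x[1])
        return best_match[0] if best_match[1] > 0 else "miscellaneous"

    print(demo_predict("ordered pizza on zomato for dinner"))  # food (3 keyword hits)
    print(demo_predict("paid the electrician"))                # miscellaneous

One design note: membership is a plain substring test, so for example "chocolate" also scores a hit for "ola"; word-boundary matching may be worth considering later.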
@@ -491,17 +561,17 @@ def insert_text_entry(data):
 
         insert_query = """
         INSERT INTO user_entries (
-            uuid, raw_text, word_count, day_of_week, hour_of_day, month, year,
-            type, intent, confidence_scores, urgency_score,
+            uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
+            type, expense_type, intent, confidence_scores, urgency_score,
             time_mentions, parsed_dates, tense, summary,
             people, mood, language, sentiment_score, tags,
-            action_required, entities, amounts, stores, processing_time_ms
+            action_required, entities, amounts, stores, processing_time_ms, raw_json
         ) VALUES (
-            %(uuid)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
-            %(type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
+            %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
+            %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
             %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
             %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
-            %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s
+            %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
         )
         ON CONFLICT (uuid) DO NOTHING;
         """
@@ -511,7 +581,8 @@ def insert_text_entry(data):
             "confidence_scores": Json(data["confidence_scores"]),
             "language": Json(data["language"]),
             "entities": Json(data["entities"]),
-            "amounts": Json(data["amounts"])
+            "amounts": Json(data["amounts"]),
+            "raw_json": Json(data["raw_json"])
         })
 
         conn.commit()
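For readers unfamiliar with the adapter used here: psycopg2's Json wrapper serializes a Python object with json.dumps at bind time so it can land in a JSONB column. A minimal sketch, assuming an open connection and cursor (conn and cur are placeholders, not from this commit):

    import uuid
    from psycopg2.extras import Json

    # Json(...) adapts the Python list/dict to a JSONB parameter;
    # without it, psycopg2 has no adaptation for plain dicts.
    cur.execute(
        "INSERT INTO user_entries (uuid, amounts) VALUES (%s, %s)",
        (str(uuid.uuid4()), Json([{"value": 1250.0, "currency": "INR"}])),
    )
    conn.commit()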
@@ -576,10 +647,12 @@ async def analyze(input: TextInput):
 
     scores = dict(zip(classification['labels'], classification['scores']))
     # Convert to short labels
-    confidence_scores = {
+    confidence_scores_full = {
         label_map.get(label, label): score
         for label, score in scores.items()
     }
+    # Only keep top 2
+    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
 
     amounts = await asyncio.to_thread(extract_amounts, text)
     parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
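The top-2 truncation added here is ordinary dict sorting; for example:

    scores = {"expense": 0.81, "note": 0.12, "task": 0.07}
    top2 = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:2])
    print(top2)  # {'expense': 0.81, 'note': 0.12}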
@@ -594,6 +667,9 @@ async def analyze(input: TextInput):
     intent = infer_intent(best_label, text)
     urgency_score = get_urgency_score(text, parsed_dates)
     detected_stores = detect_store_category(text)
+    expense_category = ""
+    if best_label == "expense":
+        expense_category = predict_expense_category(text, detected_stores)
 
     # Define action triggers
     ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
@@ -608,6 +684,7 @@ async def analyze(input: TextInput):
 
     result = {
         "uuid": str(uuid.uuid4()),  # Unique identifier for the request
+        "user_id": random.randint(1, 10),  # Placeholder: random user id for now
         "raw_text": text,
         "word_count": meta["word_count"],
         "day_of_week": meta["day_of_week"],
@@ -615,6 +692,7 @@ async def analyze(input: TextInput):
         "month": meta["month"],
         "year": meta["year"],
         "type": best_label,
+        "expense_type": expense_category,
         "intent": intent,
         "confidence_scores": confidence_scores,
         "urgency_score": urgency_score,
@@ -633,12 +711,21 @@ async def analyze(input: TextInput):
         "stores": detected_stores,
         "processing_time_ms": processing_time_ms
     }
+
+    # Store a copy of result without raw_json to avoid circular reference
+    raw_json_copy = result.copy()
+    # Remove raw_json if present (shouldn't be, but for safety)
+    raw_json_copy.pop("raw_json", None)
+    result["raw_json"] = raw_json_copy
 
     # Insert into database
     await asyncio.to_thread(insert_text_entry, result)
 
     # Log the result
-    print("✅ Analysis complete…
+    print("✅ Analysis complete")
+
+    # Remove raw_json from response
+    result.pop("raw_json", None)
 
     # Return the result as JSON response
     return ORJSONResponse(content=result)
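The copy-then-assign dance in the last hunk exists because storing result inside itself would create a self-referential dict that json.dumps (and therefore psycopg2's Json wrapper) cannot serialize. A minimal illustration of why the shallow copy is taken before the assignment:

    import json

    result = {"uuid": "abc"}
    result["raw_json"] = result            # direct self-reference
    try:
        json.dumps(result)
    except ValueError as e:
        print("fails:", e)                 # fails: Circular reference detected

    result.pop("raw_json")                 # undo, then snapshot first, as the commit does
    result["raw_json"] = result.copy()     # copy evaluated before the assignment: no cycle
    print(json.dumps(result))              # {"uuid": "abc", "raw_json": {"uuid": "abc"}}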