Commit e6976d6 · Mitesh Koshiya committed
1 parent: 65c85da

Update expense specific things
main.py CHANGED
@@ -24,6 +24,7 @@ import psycopg2
 from psycopg2.extras import Json
 import os
 from dotenv import load_dotenv
+import random
 
 # Load environment variables
 load_dotenv()
@@ -42,6 +43,9 @@ app.add_middleware(
 CREATE_TABLE_QUERY = """
 CREATE TABLE IF NOT EXISTS user_entries (
     uuid UUID PRIMARY KEY,
+    user_id TEXT,
+    user_name TEXT,
+    user_email TEXT,
     raw_text TEXT,
     word_count INT,
     day_of_week TEXT,
@@ -49,6 +53,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
     month TEXT,
     year INT,
     type TEXT,
+    expense_type TEXT,
     intent TEXT,
     confidence_scores JSONB,
     urgency_score INT,
@@ -66,6 +71,7 @@ CREATE TABLE IF NOT EXISTS user_entries (
     amounts JSONB,
     stores TEXT[],
     processing_time_ms INT,
+    raw_json JSONB,
     created_at TIMESTAMPTZ DEFAULT now()
 );
 """
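One deployment caveat worth noting: `CREATE TABLE IF NOT EXISTS` is a no-op when `user_entries` already exists, so the columns added above will not appear on a database created by an earlier version of this Space. A one-time migration along these lines would be needed; this is a hedged sketch, not part of the commit, and `MIGRATE_COLUMNS_QUERY` is a name invented here:

    MIGRATE_COLUMNS_QUERY = """
    ALTER TABLE user_entries
        ADD COLUMN IF NOT EXISTS user_id TEXT,
        ADD COLUMN IF NOT EXISTS user_name TEXT,
        ADD COLUMN IF NOT EXISTS user_email TEXT,
        ADD COLUMN IF NOT EXISTS expense_type TEXT,
        ADD COLUMN IF NOT EXISTS raw_json JSONB;
    """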
@@ -419,22 +425,26 @@ def get_meta_info(text: str):
 # Function to extract amounts in various currencies from text
 def extract_amounts(text: str):
     currency_patterns = [
-        (r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)", "INR"),
-        … (the remaining raw-string pattern tuples are truncated in the rendered diff) …
+        # INR variants
+        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?(?:₹|rs\.?|inr)"), "INR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(rupees?|rupaye|rupiye)"), "INR"),
+        # USD variants
+        (re.compile(r"(?:\$)\s?(\d[\d,]*(?:\.\d+)?)"), "USD"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?\$"), "USD"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(dollars?)"), "USD"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(cents?)"), "USD"),
+        # EUR variants
+        (re.compile(r"(?:€|eur)\s?(\d[\d,]*(?:\.\d+)?)"), "EUR"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?€"), "EUR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(euros?)"), "EUR"),
+        # GBP variants
+        (re.compile(r"(?:£|gbp)\s?(\d[\d,]*(?:\.\d+)?)"), "GBP"),
+        (re.compile(r"(\d[\d,]*(?:\.\d+)?)\s?£"), "GBP"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(pounds?)"), "GBP"),
+        # INR large units
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
+        (re.compile(r"(\d+(?:\.\d+)?)\s?(crores?|crs?|cr)"), "INR"),
     ]
 
     results = []
@@ -442,19 +452,25 @@ def extract_amounts(text: str):
     text_lower = text.lower()
 
     for pattern, currency_code in currency_patterns:
-        for match in re.finditer(pattern, text_lower):
+        for match in pattern.finditer(text_lower):
             groups = match.groups()
             raw_number = next((g for g in groups if re.match(r"\d", g)), None)
             if not raw_number:
                 continue
+            # Ignore phone numbers and IDs (10+ digits)
+            if len(raw_number.replace(",", "")) >= 10:
+                continue
             try:
                 number = float(raw_number.replace(",", ""))
-                if … (truncated in the rendered diff)
+
+                # Check for lakh/crore/cents multipliers
+                if any(g in ['lakh', 'lacs', 'lakhs'] for g in groups):
                     number *= 100_000
-                elif any(… (truncated in the rendered diff)
+                elif any(g in ['crore', 'crores', 'cr', 'crs'] for g in groups):
                     number *= 10_000_000
-                elif 'cents' in groups:
+                elif any(g == 'cents' for g in groups):
                     number /= 100
+
             except Exception:
                 continue
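As a sanity check on the compiled-pattern flow in the hunk above, here is a minimal, self-contained sketch; only two of the patterns are included, and the helper name demo_extract is invented for the demo:

    import re

    # Two-pattern stand-in for the full currency_patterns list above.
    currency_patterns = [
        (re.compile(r"(?:₹|rs\.?|inr)\s?(\d[\d,]*(?:\.\d+)?)"), "INR"),
        (re.compile(r"(\d+(?:\.\d+)?)\s?(lacs?|lakhs?)"), "INR"),
    ]

    def demo_extract(text: str):
        results = []
        text_lower = text.lower()
        for pattern, currency_code in currency_patterns:
            for match in pattern.finditer(text_lower):
                groups = match.groups()
                # First capture group that starts with a digit is the amount
                raw_number = next((g for g in groups if g and re.match(r"\d", g)), None)
                if raw_number is None:
                    continue
                number = float(raw_number.replace(",", ""))
                # Mirror the lakh multiplier from the committed logic
                if any(g in ['lakh', 'lacs', 'lakhs'] for g in groups):
                    number *= 100_000
                results.append({"value": round(number, 2), "currency": currency_code})
        return results

    print(demo_extract("paid rs. 1,250 for books"))  # [{'value': 1250.0, 'currency': 'INR'}]
    print(demo_extract("car loan of 2 lakhs"))       # [{'value': 200000.0, 'currency': 'INR'}]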
@@ -466,24 +482,78 @@ def extract_amounts(text: str):
                 "currency": currency_code
             })
 
-    # Fallback …
+    # Fallback matching for generic numeric phrases near expense keywords
     if not results:
-        … (old fallback block truncated in the rendered diff) …
+        fallback_patterns = [
+            re.compile(
+                r"\b(?:paid|spent|buy|purchase|cost|price|add(?:ed)?|gift(?:ed)?|bill(?: of)?|recharge(?:d)?|charged|transfer(?:red)?)\b[^0-9]{0,10}(\d[\d,]*(?:\.\d+)?)"
+            ),
+            re.compile(r"\b(\d[\d,]{2,8})\b\s?(?:rs|inr)?")
+        ]
+        for fallback_pattern in fallback_patterns:
+            match = fallback_pattern.search(text_lower)
+            if match:
+                number_str = match.group(1).replace(",", "")
+                # Ignore phone numbers and IDs
+                if len(number_str) >= 10:
+                    continue
+                try:
+                    number = float(number_str)
+                    key = (number, "INR")
+                    if key not in seen:
+                        seen.add(key)
+                        results.append({
+                            "value": round(number, 2),
+                            "currency": "INR"
+                        })
+                    break  # Only extract first match in fallback
+                except:
+                    continue
 
     return results
 
 
+def predict_expense_category(text, detected_stores):
+    text_lower = text.lower()
+
+    # 1. Use detected store category if available
+    if detected_stores:
+        best_match = max(detected_stores, key=lambda s: s.get("confidence", 1.0))
+        return best_match["category"]
+
+    # Category keyword mapping
+    category_keywords = {
+        "food": ["food", "lunch", "dinner", "breakfast", "snacks", "swiggy", "zomato", "dominos", "pizza", "kfc", "mcdonald"],
+        "transport": ["uber", "ola", "taxi", "cab", "bus", "train", "metro", "flight", "auto"],
+        "shopping": ["amazon", "flipkart", "myntra", "shopping", "clothes", "apparel", "shoes", "jeans", "tshirt", "store", "fashion"],
+        "housing": ["rent", "apartment", "house", "flat", "maintenance", "landlord"],
+        "utilities": ["electricity", "power", "water", "gas", "bill", "recharge", "broadband", "wifi", "airtel", "jio"],
+        "entertainment": ["movie", "netflix", "hotstar", "bookmyshow", "spotify", "gaming", "youtube premium"],
+        "health": ["medicine", "hospital", "doctor", "clinic", "pharmacy", "tablet", "surgery", "checkup"],
+        "travel": ["trip", "travel", "tour", "vacation", "hotel", "airbnb", "booking.com", "goibibo", "makemytrip", "yatra"],
+        "education": ["course", "webinar", "class", "training", "workshop", "udemy", "coursera", "byjus", "unacademy", "skill"],
+        "digital_services": ["domain", "hosting", "license", "email", "software", "zoom", "notion", "figma", "aws", "google cloud", "saas"],
+        "gifts_donations": ["gift", "donation", "present", "charity", "ngo", "temple", "mandir", "birthday gift", "festival gift"],
+        "finance": ["insurance", "sip", "mutual fund", "stock", "demat", "zerodha", "investment", "trading", "upstox", "crypto"],
+        "family_kids": ["kid", "baby", "school", "daycare", "tuition", "books", "uniform", "toys", "creche"],
+        "stationery": [
+            "pen", "pencil", "notebook", "diary", "eraser", "sharpener", "paper", "stationery",
+            "register", "files", "file", "markers", "highlighter", "sticky notes", "geometry box",
+            "stapler", "ink", "printer paper", "stationary shop", "stationary"
+        ]
+    }
+
+    # 2. Match using keyword scores
+    matched = {cat: sum(1 for kw in kws if kw in text_lower) for cat, kws in category_keywords.items()}
+    best_match = max(matched.items(), key=lambda x: x[1])
+
+    if best_match[1] > 0:
+        return best_match[0]
+
+    return "miscellaneous"
+
+
 def insert_text_entry(data):
     try:
         conn = psycopg2.connect(DATABASE_URL)
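The keyword-scoring path of the new predict_expense_category is easy to exercise in isolation. A minimal sketch with a two-category subset of the mapping (the helper name demo_predict is invented here):

    # Two-category subset of the committed mapping, for illustration only.
    category_keywords = {
        "food": ["lunch", "dinner", "swiggy", "zomato", "pizza"],
        "transport": ["uber", "ola", "taxi", "metro", "flight"],
    }

    def demo_predict(text: str) -> str:
        text_lower = text.lower()
        # Count how many keywords of each category appear in the text
        matched = {cat: sum(1 for kw in kws if kw in text_lower)
                   for cat, kws in category_keywords.items()}
        best_match = max(matched.items(), key=lambda x: x[1])
        return best_match[0] if best_match[1] > 0 else "miscellaneous"

    print(demo_predict("ordered pizza on zomato for dinner"))  # food (3 keyword hits)
    print(demo_predict("paid the electrician"))                # miscellaneous

One design note: membership is a plain substring test, so for example "chocolate" also scores a hit for "ola"; word-boundary matching may be worth considering later.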
@@ -491,17 +561,17 @@ def insert_text_entry(data):
 
         insert_query = """
         INSERT INTO user_entries (
-            uuid, raw_text, word_count, day_of_week, hour_of_day, month, year,
-            type, intent, confidence_scores, urgency_score,
+            uuid, user_id, raw_text, word_count, day_of_week, hour_of_day, month, year,
+            type, expense_type, intent, confidence_scores, urgency_score,
             time_mentions, parsed_dates, tense, summary,
             people, mood, language, sentiment_score, tags,
-            action_required, entities, amounts, stores, processing_time_ms
+            action_required, entities, amounts, stores, processing_time_ms, raw_json
         ) VALUES (
-            %(uuid)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
-            %(type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
+            %(uuid)s, %(user_id)s, %(raw_text)s, %(word_count)s, %(day_of_week)s, %(hour_of_day)s, %(month)s, %(year)s,
+            %(type)s, %(expense_type)s, %(intent)s, %(confidence_scores)s, %(urgency_score)s,
             %(time_mentions)s, %(parsed_dates)s, %(tense)s, %(summary)s,
             %(people)s, %(mood)s, %(language)s, %(sentiment_score)s, %(tags)s,
-            %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s
+            %(action_required)s, %(entities)s, %(amounts)s, %(stores)s, %(processing_time_ms)s, %(raw_json)s
         )
         ON CONFLICT (uuid) DO NOTHING;
         """
@@ -511,7 +581,8 @@ def insert_text_entry(data):
             "confidence_scores": Json(data["confidence_scores"]),
             "language": Json(data["language"]),
             "entities": Json(data["entities"]),
-            "amounts": Json(data["amounts"])
+            "amounts": Json(data["amounts"]),
+            "raw_json": Json(data["raw_json"])
         })
 
         conn.commit()
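For readers unfamiliar with the adapter used here: psycopg2's Json wrapper serializes a Python object with json.dumps at bind time so it can land in a JSONB column. A minimal sketch, assuming an open connection and cursor (conn and cur are placeholders, not from this commit):

    import uuid
    from psycopg2.extras import Json

    # Json(...) adapts the Python list/dict to a JSONB parameter;
    # without it, psycopg2 has no adaptation for plain dicts.
    cur.execute(
        "INSERT INTO user_entries (uuid, amounts) VALUES (%s, %s)",
        (str(uuid.uuid4()), Json([{"value": 1250.0, "currency": "INR"}])),
    )
    conn.commit()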
@@ -576,10 +647,12 @@ async def analyze(input: TextInput):
 
     scores = dict(zip(classification['labels'], classification['scores']))
     # Convert to short labels
-    confidence_scores = {
+    confidence_scores_full = {
         label_map.get(label, label): score
         for label, score in scores.items()
     }
+    # Only keep top 2
+    confidence_scores = dict(sorted(confidence_scores_full.items(), key=lambda x: x[1], reverse=True)[:2])
 
     amounts = await asyncio.to_thread(extract_amounts, text)
     parsed_dates, time_mentions = await asyncio.to_thread(extract_dates_with_accuracy, text, amounts)
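The top-2 truncation added here is ordinary dict sorting; for example:

    scores = {"expense": 0.81, "note": 0.12, "task": 0.07}
    top2 = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True)[:2])
    print(top2)  # {'expense': 0.81, 'note': 0.12}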
@@ -594,6 +667,9 @@ async def analyze(input: TextInput):
     intent = infer_intent(best_label, text)
     urgency_score = get_urgency_score(text, parsed_dates)
     detected_stores = detect_store_category(text)
+    expense_category = ""
+    if best_label == "expense":
+        expense_category = predict_expense_category(text, detected_stores)
 
     # Define action triggers
     ACTION_TRIGGERS = ["plan", "organize", "schedule", "remember", "book", "call", "follow up", "need to"]
@@ -608,6 +684,7 @@ async def analyze(input: TextInput):
 
     result = {
         "uuid": str(uuid.uuid4()),  # Unique identifier for the request
+        "user_id": random.randint(1, 10),  # Placeholder: random user id for now
         "raw_text": text,
         "word_count": meta["word_count"],
         "day_of_week": meta["day_of_week"],
@@ -615,6 +692,7 @@ async def analyze(input: TextInput):
         "month": meta["month"],
         "year": meta["year"],
         "type": best_label,
+        "expense_type": expense_category,
         "intent": intent,
         "confidence_scores": confidence_scores,
         "urgency_score": urgency_score,
@@ -633,12 +711,21 @@ async def analyze(input: TextInput):
         "stores": detected_stores,
         "processing_time_ms": processing_time_ms
     }
+
+    # Store a copy of result without raw_json to avoid circular reference
+    raw_json_copy = result.copy()
+    # Remove raw_json if present (shouldn't be, but for safety)
+    raw_json_copy.pop("raw_json", None)
+    result["raw_json"] = raw_json_copy
 
     # Insert into database
     await asyncio.to_thread(insert_text_entry, result)
 
     # Log the result
-    print("✅ Analysis complete…
+    print("✅ Analysis complete")
+
+    # Remove raw_json from response
+    result.pop("raw_json", None)
 
     # Return the result as JSON response
     return ORJSONResponse(content=result)
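The copy-then-assign dance in the last hunk exists because storing result inside itself would create a self-referential dict that json.dumps (and therefore psycopg2's Json wrapper) cannot serialize. A minimal illustration of why the shallow copy is taken before the assignment:

    import json

    result = {"uuid": "abc"}
    result["raw_json"] = result            # direct self-reference
    try:
        json.dumps(result)
    except ValueError as e:
        print("fails:", e)                 # fails: Circular reference detected

    result.pop("raw_json")                 # undo, then snapshot first, as the commit does
    result["raw_json"] = result.copy()     # copy evaluated before the assignment: no cycle
    print(json.dumps(result))              # {"uuid": "abc", "raw_json": {"uuid": "abc"}}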