sakshamlakhera committed · Commit 733fcd8 · Parent(s): 050283f
Initial commit

Files changed:
- Dockerfile +1 -1
- Home.py +24 -0
- LICENSE +21 -0
- assets/.DS_Store +0 -0
- assets/css/styles.css +57 -0
- assets/modelWeights/best_model_onion_v1.pth +3 -0
- assets/modelWeights/best_model_pear_v1.pth +3 -0
- assets/modelWeights/best_model_strawberry_v1.pth +3 -0
- assets/modelWeights/best_model_tomato_v1.pth +3 -0
- assets/modelWeights/best_model_v1.pth +3 -0
- assets/nlp/.DS_Store +0 -0
- assets/nlp/WEIGHTS.md +0 -0
- config.py +15 -0
- model/.DS_Store +0 -0
- model/__init__.py +0 -0
- model/classifier.py +43 -0
- model/recipe_search.py +139 -0
- pages/1_Image_Classification.py +34 -0
- pages/2_Variation_Detection.py +53 -0
- pages/3_Recipe_Recommendation.py +90 -0
- pages/4_Report.py +107 -0
- scripts/.DS_Store +0 -0
- scripts/CV/.DS_Store +0 -0
- scripts/CV/script.ipynb +0 -0
- scripts/NLP/nlp_colab.py +475 -0
- scripts/NLP/processing_files_for_app.py +393 -0
- scripts/NLP/search_script.py +216 -0
- utils/.DS_Store +0 -0
- utils/__init__.py +0 -0
- utils/layout.py +33 -0
Dockerfile
CHANGED
@@ -18,4 +18,4 @@ EXPOSE 8501
 
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
 
-ENTRYPOINT ["streamlit", "run", "
+ENTRYPOINT ["streamlit", "run", "Home.py", "--server.port=8501", "--server.address=0.0.0.0"]
Home.py
ADDED
@@ -0,0 +1,24 @@
import streamlit as st
from utils.layout import set_custom_page_config, render_header

with open("assets/css/styles.css") as f:
    st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)

set_custom_page_config()
render_header()

st.markdown("""
<div class="about-box">
Welcome to our Smart Kitchen Assistant — a CSE555 Final Project developed by Group 5 (Saksham & Ahmed).
<br><br>
🔍 This tool leverages AI to assist in:
- Classifying images of vegetables and fruits.
- Detecting their variations (cut, whole, sliced).
- Recommending recipes based on natural language input.
</div>

### 🔗 Use the left sidebar to navigate between:
- 🥦 Task A: Classification
- 🧊 Task B: Variation Detection
- 🧠 NLP Recipe Recommendation
""", unsafe_allow_html=True)
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 azaher1215

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
assets/.DS_Store
ADDED
Binary file (6.15 kB)
assets/css/styles.css
ADDED
@@ -0,0 +1,57 @@
body {
    font-family: 'Segoe UI', sans-serif;
}

.block-container {
    max-width: 900px;
    margin: 0 auto;
    padding: 2rem;
}

.project-header {
    text-align: center;
    margin-top: 1rem;
    margin-bottom: 2rem;
}

.home-container {
    display: flex;
    justify-content: center;
    align-items: center;
    height: 70vh;
}

.home-card {
    background: #ffffff;
    border-radius: 12px;
    padding: 2rem;
    box-shadow: 0 8px 20px rgba(0, 0, 0, 0.1);
    max-width: 600px;
    text-align: center;
}

.about-box {
    background-color: #f1f3f6;
    border-left: 5px solid #4a90e2;
    padding: 1rem;
    margin-bottom: 1.5rem;
    border-radius: 6px;
    font-size: 0.95rem;
}

img {
    border-radius: 10px;
}

/* Reduce sidebar width */
.css-1d391kg, .css-1d391kg > div {
    width: 250px !important;
}

/* Standard text sizes */
h1 { font-size: 2.2rem; }
h2 { font-size: 1.5rem; }
p, li { font-size: 1rem; }

/* Sidebar tweaks */
.css-1lcbmhc { padding-top: 2rem; }
assets/modelWeights/best_model_onion_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ce6d74a4b1ccf494999e60addc2f8995072eca00837eb77eabd71ee859a0023
size 16343319
assets/modelWeights/best_model_pear_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07e5a67e49f46112e14f0e533c7df4edaf4562ebbffcf65393f0d8bd130a8a37
size 16342953
assets/modelWeights/best_model_strawberry_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:deed87390d881b658d39db29ec6e1850bf6c09bbf47882bd611a3a1de821fe4e
size 16345405
assets/modelWeights/best_model_tomato_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bb1db959f9732f49d95d174a6ba01da3271f57f5169b8af94a01abff7e78d329
size 16343685
assets/modelWeights/best_model_v1.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6aced5beaeea31c3cf030c250bbaf4c4c3f8d644b4dda6db5d21b4358d27b994
size 16346243
assets/nlp/.DS_Store
ADDED
Binary file (6.15 kB)
assets/nlp/WEIGHTS.md
ADDED
File without changes
config.py
ADDED
@@ -0,0 +1,15 @@
CLASS_LABELS = ['onion', 'pear', 'strawberry', 'tomato']

MODEL_PATH = "assets/modelWeights/best_model_v1.pth"
MODEL_PATH_ONION = "assets/modelWeights/best_model_onion_v1.pth"
MODEL_PATH_PEAR = "assets/modelWeights/best_model_pear_v1.pth"
MODEL_PATH_TOMATO = "assets/modelWeights/best_model_tomato_v1.pth"
MODEL_PATH_STRAWBERRY = "assets/modelWeights/best_model_strawberry_v1.pth"

GOOGLE_DRIVE_FILES = {
    'assets/nlp/torch_recipe_embeddings_231630.pt': '1PSidY1toSfgECXDxa4pGza56Jq6vOq6t',
    'assets/nlp/tag_based_bert_model.pth': '1LBl7yFs5JFqOsgfn88BF9g83W9mxiBm6',
    'assets/nlp/RAW_recipes.csv': '1rFJQzg_ErwEpN6WmhQ4jRyiXv6JCINyf',
    'assets/nlp/recipe_statistics_231630.pkl': '1n8TNT-6EA_usv59CCCU1IXqtuM7i084E',
    'assets/nlp/recipe_scores_231630.pkl': '1gfPBzghKHOZqgJu4VE9NkandAd6FGjrA'
}
model/.DS_Store
ADDED
Binary file (6.15 kB)
model/__init__.py
ADDED
File without changes
model/classifier.py
ADDED
@@ -0,0 +1,43 @@
import torch
import torch.nn as nn
from typing import Tuple, List
from torchvision import models, transforms
from PIL import Image
from config import CLASS_LABELS, MODEL_PATH
import torch.nn.functional as F


def get_model():
    model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, len(CLASS_LABELS))
    model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))
    model.eval()
    return model

def get_model_by_name(model_path: str, num_classes: int):
    model = models.efficientnet_b0(weights=None)
    model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))

    model.eval()
    return model


def predict(image: Image.Image, model, class_labels: List[str] = None) -> Tuple[str, float]:
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])
    image_tensor = transform(image).unsqueeze(0)

    with torch.no_grad():
        output = model(image_tensor)
        probabilities = F.softmax(output, dim=1)
        confidence, pred = torch.max(probabilities, dim=1)
        print(pred)

    if class_labels is None:
        class_labels = CLASS_LABELS

    return class_labels[pred.item()], confidence.item()
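For reference, a minimal usage sketch of the classifier module above; it is not one of the committed files, and the image path and the onion variation labels are illustrative assumptions (the labels mirror VARIATION_CLASS_MAP in pages/2_Variation_Detection.py):

from PIL import Image
from model.classifier import get_model, get_model_by_name, predict
from config import MODEL_PATH_ONION

# Four-class produce classifier (onion / pear / strawberry / tomato).
model = get_model()
img = Image.open("sample.jpg").convert("RGB")  # hypothetical local test image
label, confidence = predict(img, model)
print(f"{label}: {confidence:.2%}")

# Per-product variation head, e.g. the onion model with its three variation classes.
onion_model = get_model_by_name(MODEL_PATH_ONION, num_classes=3)
variation, var_conf = predict(img, onion_model, class_labels=['halved', 'sliced', 'whole'])
print(f"{variation}: {var_conf:.2%}")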
model/recipe_search.py
ADDED
@@ -0,0 +1,139 @@
import os
import csv
import ast
import pickle
import gdown
import torch
import torch.nn.functional as F
import streamlit as st
from transformers import BertTokenizer, BertModel
from config import GOOGLE_DRIVE_FILES


def download_file_from_drive(file_id: str, destination: str, file_name: str) -> bool:
    try:
        with st.spinner(f"Downloading {file_name}..."):
            url = f"https://drive.google.com/uc?id={file_id}"
            gdown.download(url, destination, quiet=False)
        return True
    except Exception as e:
        st.error(f"Failed to download {file_name}: {e}")
        return False

def ensure_files_downloaded():
    for filename, file_id in GOOGLE_DRIVE_FILES.items():
        if not os.path.exists(filename):
            success = download_file_from_drive(file_id, filename, filename)
            if not success:
                return False
    return True

class GoogleDriveRecipeSearch:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        if not ensure_files_downloaded():
            self.is_ready = False
            return

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertModel.from_pretrained("bert-base-uncased")

        if os.path.exists("assets/nlp/tag_based_bert_model.pth"):
            self.model.load_state_dict(
                torch.load("assets/nlp/tag_based_bert_model.pth", map_location=self.device)
            )
            st.success("Trained model loaded successfully!")
        else:
            st.warning("Using untrained model")

        self.model.to(self.device)
        self.model.eval()

        self.load_data()
        self.is_ready = True

    def load_data(self):
        self.recipe_embeddings = torch.load("assets/nlp/torch_recipe_embeddings_231630.pt", map_location=self.device)
        self.recipes = self._load_recipes("assets/nlp/RAW_recipes.csv")
        self.recipe_stats = pickle.load(open("assets/nlp/recipe_statistics_231630.pkl", "rb"))
        self.recipe_scores = pickle.load(open("assets/nlp/recipe_scores_231630.pkl", "rb"))

    def _load_recipes(self, path):
        recipes = []
        with open(path, "r", encoding="utf-8") as file:
            reader = csv.DictReader(file)
            for idx, row in enumerate(reader):
                name = row.get("name", "").strip()
                if not name or name.lower() in ["nan", "unknown recipe"]:
                    continue
                try:
                    recipe = {
                        "id": int(row.get("id", idx)),
                        "name": name,
                        "ingredients": ast.literal_eval(row.get("ingredients", "[]")),
                        "tags": ast.literal_eval(row.get("tags", "[]")),
                        "minutes": int(float(row.get("minutes", 0))),
                        "n_steps": int(float(row.get("n_steps", 0))),
                        "description": row.get("description", ""),
                        "steps": ast.literal_eval(row.get("steps", "[]"))
                    }
                    recipes.append(recipe)
                except:
                    continue
        return recipes

    def search_recipes(self, query, num_results=5, min_rating=3.0):
        if not query.strip():
            return []
        print('im here')

        tokens = self.tokenizer(query, return_tensors="pt", truncation=True, padding=True)
        tokens = {k: v.to(self.device) for k, v in tokens.items()}

        with torch.no_grad():
            outputs = self.model(**tokens)
            query_embedding = outputs.last_hidden_state[:, 0, :]

        query_embedding = F.normalize(query_embedding, dim=1)
        recipe_embeddings = F.normalize(self.recipe_embeddings, dim=1)

        similarity_scores = torch.matmul(recipe_embeddings, query_embedding.T).squeeze()

        final_scores = []
        for i in range(len(self.recipe_embeddings)):
            recipe = self.recipes[i]
            avg_rating, num_ratings, *_ = self.recipe_stats.get(recipe["id"], (0.0, 0, 0))
            if avg_rating < min_rating or num_ratings < 2:
                continue
            combined_score = (
                0.6 * similarity_scores[i].item() +
                0.4 * self.recipe_scores.get(recipe["id"], 0)
            )
            final_scores.append((combined_score, i))

        top_matches = sorted(final_scores, key=lambda x: x[0], reverse=True)[:num_results]

        results = []
        for score, idx in top_matches:
            recipe = self.recipes[idx]
            avg_rating, num_ratings, *_ = self.recipe_stats.get(recipe["id"], (0.0, 0, 0))
            results.append({
                "name": recipe["name"],
                "tags": recipe.get("tags", []),
                "ingredients": recipe.get("ingredients", []),
                "minutes": recipe.get("minutes", 0),
                "n_steps": recipe.get("n_steps", 0),
                "avg_rating": avg_rating,
                "num_ratings": num_ratings,
                "similarity_score": similarity_scores[idx].item(),
                "combined_score": score,
                "steps": recipe.get("steps", []),
                "description": recipe.get("description", "")
            })

        return results

@st.cache_resource
def load_search_system():
    return GoogleDriveRecipeSearch()
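A minimal sketch of how the search system above is driven, assuming the Google Drive assets listed in config.py are already present under assets/nlp/; the query string is only an example:

from model.recipe_search import load_search_system

search = load_search_system()  # cached GoogleDriveRecipeSearch instance
if search.is_ready:
    results = search.search_recipes("quick vegetarian pasta", num_results=3, min_rating=3.5)
    for r in results:
        # combined_score = 0.6 * similarity + 0.4 * popularity, as computed in search_recipes
        print(f"{r['name']}: match {r['similarity_score']:.1%}, overall {r['combined_score']:.1%}")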
pages/1_Image_Classification.py
ADDED
@@ -0,0 +1,34 @@
from utils.layout import render_layout
import streamlit as st
from PIL import Image
from model.classifier import get_model, predict

def classification_page():
    st.markdown("## 🖼️ Task A: Image Classification")

    st.markdown("""
    <div class="about-box">
    This module classifies images into <b>Onion, Pear, Strawberry, or Tomato</b>
    using an EfficientNet-B0 model.
    </div>
    """, unsafe_allow_html=True)

    model = load_model()

    uploaded = st.file_uploader("📤 Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded:
        img = Image.open(uploaded).convert("RGB")
        label, confidence = predict(img, model)
        print(label)

        st.success(f"🎯 Prediction: **{label.upper()}** ({confidence*100:.2f}% confidence)")

        st.markdown("<div style='text-align: center;'>", unsafe_allow_html=True)
        st.image(img, caption="Uploaded Image", width=300)
        st.markdown("</div>", unsafe_allow_html=True)

@st.cache_resource
def load_model():
    return get_model()

render_layout(classification_page)
pages/2_Variation_Detection.py
ADDED
@@ -0,0 +1,53 @@
from utils.layout import render_layout
import streamlit as st
from PIL import Image
from model.classifier import predict, get_model_by_name
import config

VARIATION_CLASS_MAP = {
    "Onion": ['halved', 'sliced', 'whole'],
    "Strawberry": ['Hulled', 'sliced', 'whole'],
    "Tomato": ['diced', 'vines', 'whole'],
    "Pear": ['halved', 'sliced', 'whole']
}

MODEL_PATH_MAP = {
    "Onion": config.MODEL_PATH_ONION,
    "Pear": config.MODEL_PATH_PEAR,
    "Strawberry": config.MODEL_PATH_STRAWBERRY,
    "Tomato": config.MODEL_PATH_TOMATO
}

@st.cache_resource
def load_model(product_name):
    model_path = MODEL_PATH_MAP[product_name]
    num_classes = len(VARIATION_CLASS_MAP[product_name])
    return get_model_by_name(model_path, num_classes=num_classes)

def variation_detection_page():
    st.markdown("## 🔍 Task B: Variation Detection")

    st.markdown("""
    <div class="about-box">
    This module detects variations such as <code>Whole</code>, <code>Halved</code>, <code>Diced</code>, etc.
    for Onion, Pear, Strawberry, and Tomato using individually fine-tuned models.
    </div>
    """, unsafe_allow_html=True)

    product = st.selectbox("Select Product Type", list(MODEL_PATH_MAP.keys()))

    model = load_model(product)
    class_labels = VARIATION_CLASS_MAP[product]

    uploaded = st.file_uploader("📤 Upload an image (JPG/PNG)", type=["jpg", "jpeg", "png"])
    if uploaded:
        img = Image.open(uploaded).convert("RGB")
        label, confidence = predict(img, model, class_labels=class_labels)

        st.success(f"🔍 Detected Variation: **{label}** ({confidence * 100:.2f}% confidence)")

        st.markdown("<div style='text-align: center;'>", unsafe_allow_html=True)
        st.image(img, caption=f"Uploaded Image - {product}", width=300)
        st.markdown("</div>", unsafe_allow_html=True)

render_layout(variation_detection_page)
pages/3_Recipe_Recommendation.py
ADDED
@@ -0,0 +1,90 @@
from utils.layout import render_layout
import streamlit as st
import time
from model.recipe_search import load_search_system  # assumed you modularized this logic
import streamlit.components.v1 as components

def recipe_search_page():
    st.markdown("""
    ## 🍽️ Advanced Recipe Recommendation
    <div class="about-box">
    This module uses a custom-trained BERT model to semantically search recipes
    based on your query, ingredients, and tags.
    </div>
    """, unsafe_allow_html=True)

    if 'search_system' not in st.session_state:
        with st.spinner("🔄 Initializing recipe search system..."):
            st.session_state.search_system = load_search_system()

    search_system = st.session_state.search_system

    if not search_system.is_ready:
        st.error("❌ System not ready. Please check data files and try again.")
        return

    query = st.text_input(
        "Search for recipes:",
        placeholder="e.g., 'chicken pasta', 'vegetarian salad', 'chocolate dessert'"
    )

    col1, col2 = st.columns(2)
    with col1:
        num_results = st.slider("Number of results", 1, 15, 5)
    with col2:
        min_rating = st.slider("Minimum rating", 1.0, 5.0, 3.0, 0.1)

    if st.button("🔍 Search Recipes") and query:
        with st.spinner(f"Searching for '{query}'..."):
            start = time.time()
            print(query, num_results, min_rating)
            results = search_system.search_recipes(query, num_results, min_rating)
            elapsed = time.time() - start

        if results:
            st.markdown(f"### 🎯 Top {len(results)} recipe recommendations for: *'{query}'*")
            st.markdown("<sub>📊 Sorted by best match using semantic search and popularity</sub>", unsafe_allow_html=True)
            st.markdown("<hr>", unsafe_allow_html=True)

            for i, recipe in enumerate(results, 1):
                steps_html = "".join([f"<li>{step.strip().capitalize()}</li>" for step in recipe.get("steps", [])])
                description = recipe.get("description", "").strip().capitalize()

                html_code = f"""
                <div style="margin-bottom: 24px; padding: 16px; border-radius: 12px; background-color: #fdfdfd; box-shadow: 0 2px 8px rgba(0,0,0,0.06); font-family: Arial, sans-serif;">
                    <div style="font-size: 18px; font-weight: bold; color: #333;">🔝 {i}. {recipe['name']}</div>

                    <div style="margin: 4px 0 8px 0; font-size: 14px; color: #555;">
                        ⏱️ <b>{recipe['minutes']} min</b> | 🔥 <b>{recipe['n_steps']} steps</b> | ⭐ <b>{recipe['avg_rating']:.1f}/5.0</b>
                        <span style="font-size: 12px; color: #999;">({recipe['num_ratings']} ratings)</span>
                    </div>

                    <div style="margin-bottom: 6px; font-size: 14px;">
                        <b>🔍 Match Score:</b> <span style="color: #007acc; font-weight: bold;">{recipe['similarity_score']:.1%}</span>
                        <span style="font-size: 12px; color: #888;">(query match)</span><br>
                        <b>🏆 Overall Score:</b> <span style="color: green; font-weight: bold;">{recipe['combined_score']:.1%}</span>
                        <span style="font-size: 12px; color: #888;">(match + popularity)</span>
                    </div>

                    <div style="margin-bottom: 6px;">
                        <b>🏷️ Tags:</b><br>
                        {" ".join([f"<span style='background:#eee;padding:4px 8px;border-radius:6px;margin:2px;display:inline-block;font-size:12px'>{tag}</span>" for tag in recipe['tags']])}
                    </div>

                    <div style="margin-bottom: 6px;">
                        <b>🥘 Ingredients:</b><br>
                        <span style="font-size: 13px; color: #444;">{', '.join(recipe['ingredients'][:8])}
                        {'...' if len(recipe['ingredients']) > 8 else ''}</span>
                    </div>

                    {"<div style='margin-top: 10px; font-size: 13px; color: #333;'><b>📖 Description:</b><br>" + description + "</div>" if description else ""}

                    {"<div style='margin-top: 10px; font-size: 13px;'><b>🧑‍🍳 Steps:</b><ol style='margin: 6px 0 0 18px; padding: 0;'>" + steps_html + "</ol></div>" if steps_html else ""}
                </div>
                """
                components.html(html_code, height=360 + len(recipe.get("steps", [])) * 20)

        else:
            st.warning(f"😔 No recipes found for '{query}' with a minimum rating of {min_rating}/5.0.")

render_layout(recipe_search_page)
pages/4_Report.py
ADDED
@@ -0,0 +1,107 @@
import streamlit as st

def render_report():
    st.title("📊 Recipe Search System Report")

    st.markdown("""
    ## Overview
    This report summarizes the working of the **custom BERT-based Recipe Recommendation System**, dataset characteristics, scoring algorithm, and evaluation metrics.
    """)

    st.markdown("### 🔍 Query Embedding and Similarity Calculation")
    st.latex(r"""
    \text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
    """)
    st.markdown("""
    Here, $\\hat{q}$ is the BERT embedding of the query, and $\\hat{r}_i$ is the embedding of the i-th recipe.
    """)

    st.markdown("### 🏆 Final Score Calculation")
    st.latex(r"""
    \text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
    """)

    st.markdown("### 📊 Dataset Summary")
    st.markdown("""
    - **Total Recipes:** 231,630
    - **Average Tags per Recipe:** ~6
    - **Ingredients per Recipe:** 3 to 20
    - **Ratings Data:** Extracted from user interaction dataset
    """)

    st.markdown("### 🧪 Evaluation Strategy")
    st.markdown("""
    We use a combination of:
    - Manual inspection
    - Recipe diversity analysis
    - Match vs rating correlation
    - Qualitative feedback from test queries
    """)

    st.markdown("---")
    st.markdown("© 2025 Your Name. All rights reserved.")

# If using a layout wrapper:
render_report()


# LaTeX content as string
latex_report = r"""
\documentclass{article}
\usepackage{amsmath}
\usepackage{geometry}
\geometry{margin=1in}
\title{Recipe Recommendation System Report}
\author{Saksham Lakhera}
\date{\today}

\begin{document}
\maketitle

\section*{Overview}
This report summarizes the working of the \textbf{custom BERT-based Recipe Recommendation System}, dataset characteristics, scoring algorithm, and evaluation metrics.

\section*{Query Embedding and Similarity Calculation}
\[
\text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
\]
Here, $\hat{q}$ is the BERT embedding of the query, and $\hat{r}_i$ is the embedding of the i-th recipe.

\section*{Final Score Calculation}
\[
\text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
\]

\section*{Dataset Summary}
\begin{itemize}
\item \textbf{Total Recipes:} 231,630
\item \textbf{Average Tags per Recipe:} $\sim$6
\item \textbf{Ingredients per Recipe:} 3 to 20
\item \textbf{Ratings Source:} User interaction dataset
\end{itemize}

\section*{Evaluation Strategy}
We use a combination of:
\begin{itemize}
\item Manual inspection
\item Recipe diversity analysis
\item Match vs rating correlation
\item Qualitative user feedback
\end{itemize}

\end{document}
"""

# ⬇️ Download button to get the .tex file
st.markdown("### 📥 Download LaTeX Report")
st.download_button(
    label="Download LaTeX (.tex)",
    data=latex_report,
    file_name="recipe_report.tex",
    mime="text/plain"
)

# 📤 Optional: Show the .tex content in the app
with st.expander("📄 View LaTeX (.tex) File Content"):
    st.code(latex_report, language="latex")
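A small worked example of the report's scoring formula, using made-up similarity and popularity values:

# Score_i = 0.6 * Similarity_i + 0.4 * Popularity_i  (weights taken from the report above)
similarity = 0.82   # example cosine similarity between query and recipe embeddings
popularity = 0.65   # example normalized popularity score
score = 0.6 * similarity + 0.4 * popularity
print(round(score, 3))  # 0.752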
scripts/.DS_Store
ADDED
Binary file (6.15 kB)
scripts/CV/.DS_Store
ADDED
Binary file (6.15 kB)
scripts/CV/script.ipynb
ADDED
The diff for this file is too large to render.
scripts/NLP/nlp_colab.py
ADDED
@@ -0,0 +1,475 @@
import pandas as pd
from ast import literal_eval
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch
import os
from sklearn.model_selection import train_test_split
import random
import re

def clean_text(text):
    # helper function to clean the text from whitespace and double spaces;
    # converts to lowercase and checks that the text is a string first to avoid errors
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = ' '.join(text.split())
    return text.strip()

def setup_tag_categories():
    tag_categories = {
        'cuisine': [
            'italian', 'chinese', 'mexican', 'indian', 'french', 'greek', 'thai',
            'japanese', 'american', 'european', 'asian', 'mediterranean', 'spanish',
            'german', 'korean', 'vietnamese', 'turkish', 'moroccan', 'lebanese'
        ],
        'course': [
            'main-dish', 'side-dishes', 'appetizers', 'desserts', 'breakfast',
            'lunch', 'dinner', 'snacks', 'beverages', 'salads', 'soups'
        ],
        'main_ingredient': [
            'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables', 'fruit',
            'pasta', 'rice', 'cheese', 'chocolate', 'potato', 'lamb', 'turkey',
            'beans', 'nuts', 'eggs', 'tofu'
        ],
        'dietary': [
            'vegetarian', 'vegan', 'gluten-free', 'low-carb', 'healthy', 'low-fat',
            'diabetic', 'dairy-free', 'keto', 'paleo', 'whole30'
        ],
        'cooking_method': [
            'oven', 'stove-top', 'no-cook', 'microwave', 'slow-cooker', 'grilling',
            'baking', 'roasting', 'frying', 'steaming', 'braising'
        ],
        'difficulty': ['easy', 'beginner-cook', 'advanced', 'intermediate', 'quick'],
        'time': [
            '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less',
            '4-hours-or-less', 'weeknight'
        ],
        'occasion': [
            'holiday-event', 'christmas', 'thanksgiving', 'valentines-day',
            'summer', 'winter', 'spring', 'fall', 'party', 'picnic'
        ]
    }
    return tag_categories

def setup_ingredient_groups():
    ingredient_groups = {
        'proteins': [
            'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp', 'turkey',
            'lamb', 'bacon', 'ham', 'sausage', 'eggs', 'tofu', 'beans', 'lentils'
        ],
        'vegetables': [
            'onion', 'garlic', 'tomato', 'carrot', 'celery', 'pepper', 'mushroom',
            'spinach', 'broccoli', 'zucchini', 'potato', 'sweet potato'
        ],
        'grains_starches': [
            'rice', 'pasta', 'bread', 'flour', 'oats', 'quinoa', 'barley', 'noodles'
        ],
        'dairy': [
            'milk', 'butter', 'cheese', 'cream', 'yogurt', 'sour cream', 'cream cheese'
        ]
    }
    return ingredient_groups

def categorize_recipe_tags(recipe_tags, tag_categories):
    categorized_tags = {}

    # Initialize empty lists for each category
    for category_name in tag_categories.keys():
        categorized_tags[category_name] = []

    # Check each tag
    for tag in recipe_tags:
        tag_lower = tag.lower()

        # Check each category
        for category_name in tag_categories.keys():
            category_keywords = tag_categories[category_name]

            # Check if any keyword matches this tag
            for keyword in category_keywords:
                if keyword in tag_lower:
                    categorized_tags[category_name].append(tag)
                    break

    return categorized_tags

def extract_main_ingredients(ingredients_list, ingredient_groups):
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []

    # Clean each ingredient
    cleaned_ingredients = []

    for ingredient in ingredients_list:
        # Convert to string
        ingredient_string = str(ingredient) if ingredient is not None else ''
        if not ingredient_string or ingredient_string == 'nan':
            continue

        # Make lowercase
        cleaned_ingredient = ingredient_string.lower()

        # Remove common descriptor words
        words_to_remove = ['fresh', 'dried', 'chopped', 'minced', 'sliced', 'diced', 'ground', 'large', 'small', 'medium']
        for word in words_to_remove:
            cleaned_ingredient = cleaned_ingredient.replace(word, '')

        # Remove numbers
        cleaned_ingredient = re.sub(r'\d+', '', cleaned_ingredient)

        # Remove measurement words
        measurement_words = ['cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'pound', 'pounds', 'ounce', 'ounces']
        for measurement in measurement_words:
            cleaned_ingredient = cleaned_ingredient.replace(measurement, '')

        # Clean up extra spaces
        cleaned_ingredient = re.sub(r'\s+', ' ', cleaned_ingredient).strip()

        # Only keep if it's long enough
        if cleaned_ingredient and len(cleaned_ingredient) > 2:
            cleaned_ingredients.append(cleaned_ingredient)

    # Put ingredients in order of importance
    ordered_ingredients = []

    # First, add proteins (most important)
    for ingredient in cleaned_ingredients:
        for protein in ingredient_groups['proteins']:
            if protein in ingredient:
                ordered_ingredients.append(ingredient)
                break

    # Then add vegetables, grains, and dairy
    other_groups = ['vegetables', 'grains_starches', 'dairy']
    for group_name in other_groups:
        for ingredient in cleaned_ingredients:
            if ingredient not in ordered_ingredients:
                for group_item in ingredient_groups[group_name]:
                    if group_item in ingredient:
                        ordered_ingredients.append(ingredient)
                        break

    # Finally, add any remaining ingredients
    for ingredient in cleaned_ingredients:
        if ingredient not in ordered_ingredients:
            ordered_ingredients.append(ingredient)

    return ordered_ingredients

def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
    # Get recipe tags and categorize them
    recipe_tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
    categorized_tags = categorize_recipe_tags(recipe_tags, tag_categories)

    # Choose tags in priority order
    priority_categories = ['main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method']
    selected_tags = []

    for category in priority_categories:
        if category in categorized_tags:
            # Take up to 2 tags from each category
            category_tags = categorized_tags[category][:2]
            for tag in category_tags:
                selected_tags.append(tag)

    # Add some additional important tags
    important_keywords = ['easy', 'quick', 'healthy', 'spicy', 'sweet']
    remaining_tags = []

    for tag in recipe_tags:
        if tag not in selected_tags:
            for keyword in important_keywords:
                if keyword in tag.lower():
                    remaining_tags.append(tag)
                    break

    # Add up to 3 remaining tags
    for i in range(min(3, len(remaining_tags))):
        selected_tags.append(remaining_tags[i])

    # Process ingredients
    recipe_ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
    main_ingredients = extract_main_ingredients(recipe_ingredients, ingredient_groups)

    # Step 5: Create the final structured text
    # Join first 8 ingredients
    ingredients_text = ', '.join(main_ingredients[:8])

    # Join first 10 tags
    tags_text = ', '.join(selected_tags[:10])

    # Get recipe name
    recipe_name = str(recipe['name']).replace(' ', ' ').strip()

    # Create final structured text
    structured_text = f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"

    return structured_text

def create_pair_data(recipes_df: pd.DataFrame, interactions_df: pd.DataFrame, num_pairs: int = 15000):
    # This function creates the training pairs for the model.
    # We first analyzed the data to create categories for the tags and ingredients. Under each of these, we have a list for cuisine, dietary, poultry, etc.
    # As we trained the model, we found that it was not able to learn the tags and ingredients, so we created a structured text representation it can learn more easily.
    # The prompt used is: Analyze the two csv files attached and create a structured text representation to be used for training a bert model to understand
    # tags and ingredients such that if a user later searches for a quick recipe, it can be used to find a recipe that is quick to make.

    # Set up the structured text categories and groups
    tag_categories = setup_tag_categories()
    ingredient_groups = setup_ingredient_groups()

    # Make a list to store all our pairs
    pair_data_list = []

    # create the pairs
    for pair_number in range(num_pairs):

        # Pick a random recipe from our dataframe
        random_recipe_data = recipes_df.iloc[random.randint(0, len(recipes_df) - 1)]

        # Get the tags from this recipe
        recipe_tags_list = random_recipe_data['tags']

        # Select some random tags (maximum 5, but maybe less if the recipe has fewer tags)
        num_tags_to_select = min(5, len(recipe_tags_list))
        selected_tags_list = []

        # Pick a random sample of tags and join them into a query string
        selected_tags_list = random.sample(recipe_tags_list, num_tags_to_select)

        # Create the positive recipe text using the structured format
        positive_recipe_text = create_structured_recipe_text(random_recipe_data, tag_categories, ingredient_groups)

        # Find a negative recipe that has no more than 2 tags in common with the query
        anchor = ' '.join(selected_tags_list)
        anchor_tags_set = set(anchor.split())

        negative_recipe_text = None
        attempts_counter = 0
        max_attempts_allowed = 100

        # Keep trying until we find a good negative recipe (a max-attempts cap avoids an infinite loop)
        while negative_recipe_text is None and attempts_counter < max_attempts_allowed:
            random_negative_recipe = recipes_df.iloc[random.randint(0, len(recipes_df) - 1)]

            # Get tags from this negative recipe
            negative_recipe_tags = random_negative_recipe['tags']
            negative_recipe_tags_set = set(negative_recipe_tags)

            # Count how many tags overlap
            overlap_count = 0
            for anchor_tag in anchor_tags_set:
                if anchor_tag in negative_recipe_tags_set:
                    overlap_count = overlap_count + 1

            attempts_counter = attempts_counter + 1

            # If the overlap is small enough (2 or less), we can use this as the negative
            if overlap_count <= 2:
                # Create the negative recipe text using the structured format
                negative_recipe_text = create_structured_recipe_text(random_negative_recipe, tag_categories, ingredient_groups)

                print(f"Found a negative recipe. Overlap: {overlap_count}")
                break

        # If we found a negative recipe, add this pair to our list
        if negative_recipe_text is not None:
            # Create a tuple with the three parts
            pair_data_list.append((anchor, positive_recipe_text, negative_recipe_text))
            print(f"Created pair {pair_number + 1}: Anchor='{anchor}', Overlap={overlap_count}")
        else:
            print(f"Could not find negative recipe for anchor '{anchor}' after {max_attempts_allowed} attempts")

        # Show progress every 1000 pairs
        if (pair_number + 1) % 1000 == 0:
            print(f"Progress: Created {pair_number + 1}/{num_pairs} pairs")

    # Convert our list to a pandas DataFrame and return it
    result_dataframe = pd.DataFrame(pair_data_list, columns=['anchor', 'positive', 'negative'])

    print(f"Final result: Created {len(result_dataframe)} pairs total")
    return result_dataframe

class pos_neg_pair_dataset(Dataset):
    # typical dataset class that tokenizes for the bert model and returns the ids and masks
    def __init__(self, pair_data, tokenizer, max_length=128):
        self.pair_data = pair_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pair_data)

    def __getitem__(self, idx):

        anchor = self.tokenizer(
            self.pair_data.iloc[idx]['anchor'],
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length')
        positive = self.tokenizer(
            self.pair_data.iloc[idx]['positive'],
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length')
        negative = self.tokenizer(
            self.pair_data.iloc[idx]['negative'],
            return_tensors='pt',
            truncation=True,
            max_length=self.max_length,
            padding='max_length')

        return {
            'anchor_input_ids': anchor['input_ids'].squeeze(),
            'anchor_attention_mask': anchor['attention_mask'].squeeze(),
            'positive_input_ids': positive['input_ids'].squeeze(),
            'positive_attention_mask': positive['attention_mask'].squeeze(),
            'negative_input_ids': negative['input_ids'].squeeze(),
            'negative_attention_mask': negative['attention_mask'].squeeze()
        }

def evaluate_model(model, val_loader):
    # evaluation method, same as training but with no gradient updates
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()
    total_loss = 0
    criterion = nn.TripletMarginLoss(margin=1.0)
    with torch.no_grad():
        for batch in val_loader:
            anchor_input_ids = batch['anchor_input_ids'].to(device)
            anchor_attention_mask = batch['anchor_attention_mask'].to(device)
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)

            # Forward pass - get raw BERT embeddings
            anchor_outputs = model(anchor_input_ids, anchor_attention_mask)
            positive_outputs = model(positive_input_ids, positive_attention_mask)
            negative_outputs = model(negative_input_ids, negative_attention_mask)

            # Extract [CLS] token embeddings
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]

            # Calculate loss
            loss = criterion(anchor_emb, positive_emb, negative_emb)

            total_loss += loss.item()

    print(f"Average loss on validation set: {total_loss/len(val_loader):.4f}")

def train_model(train_loader, num_epochs=3):
    # initialize the model, criterion, and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.to(device)
    criterion = nn.TripletMarginLoss(margin=1.0)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            # load the ids and masks onto the device
            anchor_input_ids = batch['anchor_input_ids'].to(device)
            anchor_attention_mask = batch['anchor_attention_mask'].to(device)
            positive_input_ids = batch['positive_input_ids'].to(device)
            positive_attention_mask = batch['positive_attention_mask'].to(device)
            negative_input_ids = batch['negative_input_ids'].to(device)
            negative_attention_mask = batch['negative_attention_mask'].to(device)

            # forward pass to get the embeddings
            anchor_outputs = model(anchor_input_ids, anchor_attention_mask)
            positive_outputs = model(positive_input_ids, positive_attention_mask)
            negative_outputs = model(negative_input_ids, negative_attention_mask)

            # Extract the [CLS] token embeddings
            anchor_emb = anchor_outputs.last_hidden_state[:, 0, :]
            positive_emb = positive_outputs.last_hidden_state[:, 0, :]
            negative_emb = negative_outputs.last_hidden_state[:, 0, :]

            # Calculate loss
            loss = criterion(anchor_emb, positive_emb, negative_emb)

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        # per-epoch average loss: total loss / number of batches
        print(f'Epoch {epoch+1}, Average Loss: {total_loss/len(train_loader):.4f}')

    return model

if __name__ == '__main__':

    if not os.path.exists('pair_data.parquet'):
        # Load and prepare the data
        print("Loading recipe data")
        recipes_df = pd.read_csv('RAW_recipes.csv')

        # Clean the data
        recipes_df['name'] = recipes_df['name'].apply(clean_text)
        recipes_df['tags'] = recipes_df['tags'].apply(literal_eval)
        recipes_df['ingredients'] = recipes_df['ingredients'].apply(literal_eval)

        # Filter recipes with meaningful data (no empty tags)
        recipes_df = recipes_df[recipes_df['tags'].str.len() > 0]

        # Load interactions
        print("Loading interaction data")
        interactions_df = pd.read_csv('RAW_interactions.csv')
        interactions_df = interactions_df.dropna(subset=['rating'])
        interactions_df['rating'] = pd.to_numeric(interactions_df['rating'], errors='coerce')
        interactions_df = interactions_df.dropna(subset=['rating'])

        # Create training pairs
        pair_data = create_pair_data(recipes_df, interactions_df, num_pairs=15000)

        # Save the pair data
        pair_data.to_parquet('pair_data.parquet', index=False)
        print('Data saved to pair_data.parquet')

    else:
        pair_data = pd.read_parquet('pair_data.parquet')
        print('Data loaded from pair_data.parquet')

    # Split data into training and validation (80% training, 20% validation)
    train_data, val_data = train_test_split(pair_data, test_size=0.2, random_state=42)

    # initialize the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create the datasets with reduced max_length for better performance
    train_dataset = pos_neg_pair_dataset(train_data, tokenizer, max_length=128)
    val_dataset = pos_neg_pair_dataset(val_data, tokenizer, max_length=128)

    # Create dataloaders with a smaller batch size for stability
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # Train model
    print("Starting training...")
    model = train_model(train_loader, num_epochs=3)

    # evaluate the model
    print("Evaluating model...")
    evaluate_model(model, val_loader)

    # Save model
    torch.save(model.state_dict(), 'tag_based_bert_model.pth')
    print("Model saved to tag_based_bert_model.pth")
    print("Training Complete")
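For reference, a sketch of the triplet format the script above trains on; the anchor/positive/negative strings are invented examples in the "Recipe: … Ingredients: … Style: …" shape produced by create_structured_recipe_text, and the random tensors stand in for BERT [CLS] embeddings:

import torch
import torch.nn as nn

# One hypothetical training triplet (anchor = sampled recipe tags joined by spaces).
anchor = "30-minutes-or-less vegetarian italian easy main-dish"
positive = "Recipe: quick tomato basil pasta. Ingredients: pasta, tomato, garlic, onion, cheese. Style: pasta, italian, main-dish, vegetarian, easy"
negative = "Recipe: slow roasted beef brisket. Ingredients: beef, onion, carrot, celery. Style: beef, american, main-dish, oven, 4-hours-or-less"

# TripletMarginLoss pulls the anchor embedding toward the positive and pushes it
# away from the negative by at least the margin, as in train_model above.
criterion = nn.TripletMarginLoss(margin=1.0)
a, p, n = torch.randn(1, 768), torch.randn(1, 768), torch.randn(1, 768)  # stand-ins for [CLS] embeddings
loss = criterion(a, p, n)
print(loss.item())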
scripts/NLP/processing_files_for_app.py
ADDED
@@ -0,0 +1,393 @@
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from ast import literal_eval
import re
import pickle
from datetime import datetime

def clean_text(text):
    # helper function to clean the text from whitespace and double spaces;
    # converts to lowercase and checks that the text is a string first to avoid errors
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = ' '.join(text.split())
    return text.strip()

def setup_tag_categories():
    tag_categories = {
        'cuisine': [
            'italian', 'chinese', 'mexican', 'indian', 'french', 'greek', 'thai',
            'japanese', 'american', 'european', 'asian', 'mediterranean', 'spanish',
            'german', 'korean', 'vietnamese', 'turkish', 'moroccan', 'lebanese'
        ],
        'course': [
            'main-dish', 'side-dishes', 'appetizers', 'desserts', 'breakfast',
            'lunch', 'dinner', 'snacks', 'beverages', 'salads', 'soups'
        ],
        'main_ingredient': [
            'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables', 'fruit',
            'pasta', 'rice', 'cheese', 'chocolate', 'potato', 'lamb', 'turkey',
            'beans', 'nuts', 'eggs', 'tofu'
        ],
        'dietary': [
            'vegetarian', 'vegan', 'gluten-free', 'low-carb', 'healthy', 'low-fat',
            'diabetic', 'dairy-free', 'keto', 'paleo', 'whole30'
        ],
        'cooking_method': [
            'oven', 'stove-top', 'no-cook', 'microwave', 'slow-cooker', 'grilling',
            'baking', 'roasting', 'frying', 'steaming', 'braising'
        ],
        'difficulty': ['easy', 'beginner-cook', 'advanced', 'intermediate', 'quick'],
        'time': [
            '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less',
            '4-hours-or-less', 'weeknight'
        ],
        'occasion': [
            'holiday-event', 'christmas', 'thanksgiving', 'valentines-day',
            'summer', 'winter', 'spring', 'fall', 'party', 'picnic'
        ]
    }
    return tag_categories

def setup_ingredient_groups():

    ingredient_groups = {
        'proteins': [
            'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp', 'turkey',
            'lamb', 'bacon', 'ham', 'sausage', 'eggs', 'tofu', 'beans', 'lentils'
        ],
        'vegetables': [
            'onion', 'garlic', 'tomato', 'carrot', 'celery', 'pepper', 'mushroom',
            'spinach', 'broccoli', 'zucchini', 'potato', 'sweet potato'
        ],
        'grains_starches': [
            'rice', 'pasta', 'bread', 'flour', 'oats', 'quinoa', 'barley', 'noodles'
        ],
        'dairy': [
            'milk', 'butter', 'cheese', 'cream', 'yogurt', 'sour cream', 'cream cheese'
        ]
    }
    return ingredient_groups

def load_and_clean_recipes(recipes_path):
    print(f"Loading recipes from {recipes_path}")

    # Load the CSV file
    recipes_df = pd.read_csv(recipes_path)

    # Clean the recipe names
    recipes_df['name'] = recipes_df['name'].fillna('unknown recipe').astype(str).apply(clean_text)

    # Update the dataframe
    recipes_df['description'] = recipes_df['description'].fillna('').astype(str).apply(clean_text)

    # cleaning tags and ingredients from string format
    recipes_df['tags'] = recipes_df['tags'].apply(literal_eval)
    recipes_df['ingredients'] = recipes_df['ingredients'].apply(literal_eval)

    # Filter out recipes with no tags or ingredients
    recipes_df = recipes_df[
        (recipes_df['tags'].str.len() > 0) &
        (recipes_df['ingredients'].str.len() > 0) &
        (recipes_df['name'].str.len() > 0) &
        (recipes_df['name'] != 'unknown recipe')
    ].reset_index(drop=True)

    print(f"Final number of valid recipes: {len(recipes_df)}")
    return recipes_df

def categorize_recipe_tags(recipe_tags, tag_categories):
    categorized_tags = {}

    # Initialize empty lists for each category
    for category_name in tag_categories.keys():
        categorized_tags[category_name] = []

    # Check each tag
    for tag in recipe_tags:
        tag_lower = tag.lower()

        # Check each category
        for category_name in tag_categories.keys():
            category_keywords = tag_categories[category_name]

            # Check if any keyword matches this tag
            for keyword in category_keywords:
                if keyword in tag_lower:
                    categorized_tags[category_name].append(tag)
                    break

    return categorized_tags

def extract_main_ingredients(ingredients_list, ingredient_groups):
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []

    # Clean each ingredient
    cleaned_ingredients = []

    for ingredient in ingredients_list:
        # Convert to string
        ingredient_string = str(ingredient) if ingredient is not None else ''
        if not ingredient_string or ingredient_string == 'nan':
            continue

        # Make lowercase
        cleaned_ingredient = ingredient_string.lower()

        # Remove common descriptor words
        words_to_remove = ['fresh', 'dried', 'chopped', 'minced', 'sliced', 'diced', 'ground', 'large', 'small', 'medium']
        for word in words_to_remove:
            cleaned_ingredient = cleaned_ingredient.replace(word, '')

        # Remove numbers
        cleaned_ingredient = re.sub(r'\d+', '', cleaned_ingredient)

        # Remove measurement words
        measurement_words = ['cup', 'cups', 'tablespoon', 'tablespoons', 'teaspoon', 'teaspoons', 'pound', 'pounds', 'ounce', 'ounces']
        for measurement in measurement_words:
            cleaned_ingredient = cleaned_ingredient.replace(measurement, '')

        # Clean up extra spaces
        cleaned_ingredient = re.sub(r'\s+', ' ', cleaned_ingredient).strip()

        # Only keep if it's long enough
        if cleaned_ingredient and len(cleaned_ingredient) > 2:
+
cleaned_ingredients.append(cleaned_ingredient)
|
161 |
+
|
162 |
+
|
163 |
+
# Put ingredients in order of importance
|
164 |
+
ordered_ingredients = []
|
165 |
+
|
166 |
+
# First, add proteins (most important)
|
167 |
+
for ingredient in cleaned_ingredients:
|
168 |
+
for protein in ingredient_groups['proteins']:
|
169 |
+
if protein in ingredient:
|
170 |
+
ordered_ingredients.append(ingredient)
|
171 |
+
break
|
172 |
+
|
173 |
+
|
174 |
+
# Then add vegetables, grains, and dairy
|
175 |
+
other_groups = ['vegetables', 'grains_starches', 'dairy']
|
176 |
+
for group_name in other_groups:
|
177 |
+
for ingredient in cleaned_ingredients:
|
178 |
+
if ingredient not in ordered_ingredients:
|
179 |
+
for group_item in ingredient_groups[group_name]:
|
180 |
+
if group_item in ingredient:
|
181 |
+
ordered_ingredients.append(ingredient)
|
182 |
+
break
|
183 |
+
|
184 |
+
# Finally, add any remaining ingredients
|
185 |
+
for ingredient in cleaned_ingredients:
|
186 |
+
if ingredient not in ordered_ingredients:
|
187 |
+
ordered_ingredients.append(ingredient)
|
188 |
+
|
189 |
+
return ordered_ingredients
|
190 |
+
|
191 |
+
def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
|
192 |
+
# Get recipe tags and categorize them
|
193 |
+
recipe_tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
|
194 |
+
categorized_tags = categorize_recipe_tags(recipe_tags, tag_categories)
|
195 |
+
|
196 |
+
# Choose tags in priority order
|
197 |
+
priority_categories = ['main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method']
|
198 |
+
selected_tags = []
|
199 |
+
|
200 |
+
for category in priority_categories:
|
201 |
+
if category in categorized_tags:
|
202 |
+
# Take up to 2 tags from each category
|
203 |
+
category_tags = categorized_tags[category][:2]
|
204 |
+
for tag in category_tags:
|
205 |
+
selected_tags.append(tag)
|
206 |
+
|
207 |
+
# Add some additional important tags
|
208 |
+
important_keywords = ['easy', 'quick', 'healthy', 'spicy', 'sweet']
|
209 |
+
remaining_tags = []
|
210 |
+
|
211 |
+
for tag in recipe_tags:
|
212 |
+
if tag not in selected_tags:
|
213 |
+
for keyword in important_keywords:
|
214 |
+
if keyword in tag.lower():
|
215 |
+
remaining_tags.append(tag)
|
216 |
+
break
|
217 |
+
|
218 |
+
|
219 |
+
# Add up to 3 remaining tags
|
220 |
+
for i in range(min(3, len(remaining_tags))):
|
221 |
+
selected_tags.append(remaining_tags[i])
|
222 |
+
|
223 |
+
# Process ingredients
|
224 |
+
recipe_ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
|
225 |
+
main_ingredients = extract_main_ingredients(recipe_ingredients, ingredient_groups)
|
226 |
+
|
227 |
+
# Create the final structured text
|
228 |
+
# Join first 8 ingredients
|
229 |
+
ingredients_text = ', '.join(main_ingredients[:8])
|
230 |
+
|
231 |
+
# Join first 10 tags
|
232 |
+
tags_text = ', '.join(selected_tags[:10])
|
233 |
+
|
234 |
+
# Get recipe name
|
235 |
+
recipe_name = str(recipe['name']).replace('  ', ' ').strip()
|
236 |
+
|
237 |
+
# Create final structured text
|
238 |
+
structured_text = f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"
|
239 |
+
|
240 |
+
return structured_text
|
241 |
+
|
242 |
+
|
243 |
+
def create_recipe_statistics(interactions_path='RAW_interactions.csv'):
|
244 |
+
print("Creating recipe statistics")
|
245 |
+
|
246 |
+
# Load interactions data
|
247 |
+
interactions_df = pd.read_csv(interactions_path)
|
248 |
+
# Clean interactions data
|
249 |
+
interactions_df = interactions_df.dropna(subset=['rating'])
|
250 |
+
# Convert ratings to numbers
|
251 |
+
interactions_df['rating'] = pd.to_numeric(interactions_df['rating'], errors='coerce')
|
252 |
+
|
253 |
+
# Remove rows where rating conversion failed
|
254 |
+
interactions_df = interactions_df.dropna(subset=['rating'])
|
255 |
+
|
256 |
+
print(f"Valid interactions after cleaning: {len(interactions_df)}")
|
257 |
+
|
258 |
+
# Calculate statistics for each recipe
|
259 |
+
recipe_stats = {}
|
260 |
+
unique_recipe_ids = interactions_df['recipe_id'].unique()
|
261 |
+
|
262 |
+
for recipe_id in unique_recipe_ids:
|
263 |
+
# Get all interactions for this recipe
|
264 |
+
recipe_interactions = interactions_df[interactions_df['recipe_id'] == recipe_id]
|
265 |
+
# Calculate average rating
|
266 |
+
ratings_list = recipe_interactions['rating'].tolist()
|
267 |
+
average_rating = sum(ratings_list) / len(ratings_list)
|
268 |
+
# Count number of ratings
|
269 |
+
number_of_ratings = len(recipe_interactions)
|
270 |
+
# Count unique users
|
271 |
+
unique_users = recipe_interactions['user_id'].nunique()
|
272 |
+
|
273 |
+
recipe_stats[recipe_id] = (average_rating, number_of_ratings, unique_users)
|
274 |
+
|
275 |
+
print(f"Created statistics for {len(recipe_stats)} recipes")
|
276 |
+
return recipe_stats
|
277 |
+
|
278 |
+
def create_recipe_embeddings(recipes_df, model, tokenizer, device, tag_categories, ingredient_groups):
|
279 |
+
print("Creating recipe embeddings (this will take a long time)")
|
280 |
+
|
281 |
+
recipe_embeddings_list = []
|
282 |
+
valid_recipes_list = []
|
283 |
+
|
284 |
+
# Process each recipe one by one
|
285 |
+
for i in range(len(recipes_df)):
|
286 |
+
recipe = recipes_df.iloc[i]
|
287 |
+
|
288 |
+
try:
|
289 |
+
# Create structured text for this recipe
|
290 |
+
recipe_text = create_structured_recipe_text(recipe, tag_categories, ingredient_groups)
|
291 |
+
|
292 |
+
# Tokenize the recipe text
|
293 |
+
tokenized_input = tokenizer(
|
294 |
+
recipe_text,
|
295 |
+
return_tensors='pt',
|
296 |
+
truncation=True,
|
297 |
+
max_length=128,
|
298 |
+
padding='max_length'
|
299 |
+
)
|
300 |
+
|
301 |
+
|
302 |
+
# Get embedding from model
|
303 |
+
with torch.no_grad():
|
304 |
+
input_ids = tokenized_input['input_ids'].to(device)
|
305 |
+
attention_mask = tokenized_input['attention_mask'].to(device)
|
306 |
+
model_outputs = model(input_ids, attention_mask)
|
307 |
+
# Get CLS token embedding (first token)
|
308 |
+
cls_embedding = model_outputs.last_hidden_state[:, 0, :]
|
309 |
+
# Move to CPU and convert to numpy
|
310 |
+
embedding_numpy = cls_embedding.cpu().numpy().flatten()
|
311 |
+
|
312 |
+
# Store the embedding and recipe
|
313 |
+
recipe_embeddings_list.append(embedding_numpy)
|
314 |
+
valid_recipes_list.append(recipe.copy())
|
315 |
+
|
316 |
+
# Show progress every 1000 recipes
|
317 |
+
if len(recipe_embeddings_list) % 1000 == 0:
|
318 |
+
print(f"Processed {len(recipe_embeddings_list)} recipes")
|
319 |
+
|
320 |
+
except Exception as e:
|
321 |
+
print(f"Error processing recipe {recipe.get('id', i)}: {e}")
|
322 |
+
continue
|
323 |
+
|
324 |
+
# Convert list to numpy array
|
325 |
+
embeddings_array = np.array(recipe_embeddings_list)
|
326 |
+
|
327 |
+
# Create new dataframe with only valid recipes
|
328 |
+
valid_recipes_df = pd.DataFrame(valid_recipes_list)
|
329 |
+
valid_recipes_df = valid_recipes_df.reset_index(drop=True)
|
330 |
+
|
331 |
+
print(f"Created {len(embeddings_array)} recipe embeddings")
|
332 |
+
return embeddings_array, valid_recipes_df
|
333 |
+
|
334 |
+
def save_all_files(recipes_df, recipe_embeddings, recipe_stats):
|
335 |
+
print("Saving all files...")
|
336 |
+
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
337 |
+
np.save(f'recipe_embeddings_{timestamp}.npy', recipe_embeddings)
|
338 |
+
print(f"Saved embeddings")
|
339 |
+
|
340 |
+
# Save filtered recipes dataframe
|
341 |
+
with open(f'filtered_recipes_{timestamp}.pkl', 'wb') as f:
|
342 |
+
pickle.dump(recipes_df, f)
|
343 |
+
print(f"Saved recipes.")
|
344 |
+
|
345 |
+
# Save recipe statistics
|
346 |
+
with open(f'recipe_statistics_{timestamp}.pkl', 'wb') as f:
|
347 |
+
pickle.dump(recipe_stats, f)
|
348 |
+
print(f"Saved statistics")
|
349 |
+
|
350 |
+
print("All files saved successfully!")
|
351 |
+
|
352 |
+
def create_all_necessary_files(recipes_path, interactions_path, model_path):
|
353 |
+
print("Starting full preprocessing pipeline")
|
354 |
+
|
355 |
+
# Set up device
|
356 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
357 |
+
print(f"Using device: {device}")
|
358 |
+
|
359 |
+
# Load tokenizer
|
360 |
+
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
361 |
+
|
362 |
+
# Load the trained model
|
363 |
+
model = BertModel.from_pretrained('bert-base-uncased')
|
364 |
+
model.load_state_dict(torch.load(model_path, map_location=device))
|
365 |
+
model.to(device)
|
366 |
+
model.eval()
|
367 |
+
|
368 |
+
# Set up tag categories and ingredient groups
|
369 |
+
tag_categories = setup_tag_categories()
|
370 |
+
ingredient_groups = setup_ingredient_groups()
|
371 |
+
|
372 |
+
# Load and clean recipes
|
373 |
+
recipes_df = load_and_clean_recipes(recipes_path)
|
374 |
+
|
375 |
+
# Create recipe statistics
|
376 |
+
recipe_stats = create_recipe_statistics(interactions_path)
|
377 |
+
|
378 |
+
# Create recipe embeddings
|
379 |
+
recipe_embeddings, filtered_recipes_df = create_recipe_embeddings(
|
380 |
+
recipes_df, model, tokenizer, device, tag_categories, ingredient_groups
|
381 |
+
)
|
382 |
+
|
383 |
+
# Save all files
|
384 |
+
save_all_files(filtered_recipes_df, recipe_embeddings, recipe_stats)
|
385 |
+
|
386 |
+
if __name__ == "__main__":
|
387 |
+
create_all_necessary_files(
|
388 |
+
recipes_path='RAW_recipes.csv',
|
389 |
+
interactions_path='RAW_interactions.csv',
|
390 |
+
model_path='tag_based_bert_model.pth'
|
391 |
+
)
|
392 |
+
|
393 |
+
print("All preprocessing complete! You can now use the search system.")
|
scripts/NLP/search_script.py
ADDED
@@ -0,0 +1,216 @@
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
from transformers import BertTokenizer, BertModel
|
4 |
+
import pickle
|
5 |
+
import json
|
6 |
+
class RecipeSearchSystem:
|
7 |
+
|
8 |
+
def __init__(self, model_path='tag_based_bert_model.pth', max_recipes=231630):
|
9 |
+
# Set up device
|
10 |
+
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
11 |
+
|
12 |
+
# Load tokenizer
|
13 |
+
self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
|
14 |
+
|
15 |
+
# Load the trained model
|
16 |
+
self.model = BertModel.from_pretrained('bert-base-uncased')
|
17 |
+
self.model.load_state_dict(torch.load(model_path, map_location=self.device))
|
18 |
+
|
19 |
+
self.model.to(self.device)
|
20 |
+
self.model.eval()
|
21 |
+
|
22 |
+
# Load all the preprocessed files
|
23 |
+
self.max_recipes = max_recipes
|
24 |
+
#load recipe embeddings
|
25 |
+
self.recipe_embeddings = np.load(f'advanced_recipe_embeddings_{self.max_recipes}.npy')
|
26 |
+
#load recipes dataframe
|
27 |
+
with open(f'advanced_filtered_recipes_{self.max_recipes}.pkl', 'rb') as f:
|
28 |
+
self.recipes_df = pickle.load(f)
|
29 |
+
#load recipe statistics
|
30 |
+
with open(f'recipe_statistics_{self.max_recipes}.pkl', 'rb') as f:
|
31 |
+
self.recipe_stats = pickle.load(f)
|
32 |
+
|
33 |
+
|
34 |
+
def create_query_embedding(self, user_query):
|
35 |
+
|
36 |
+
structured_query = f"anchor: {user_query.lower()}"
|
37 |
+
|
38 |
+
# Tokenize the query
|
39 |
+
tokenized_query = self.tokenizer(
|
40 |
+
structured_query,
|
41 |
+
return_tensors='pt',
|
42 |
+
truncation=True,
|
43 |
+
max_length=128,
|
44 |
+
padding='max_length'
|
45 |
+
)
|
46 |
+
|
47 |
+
# Move to device
|
48 |
+
tokenized_query = tokenized_query.to(self.device)
|
49 |
+
|
50 |
+
# Get embedding from model
|
51 |
+
with torch.no_grad():
|
52 |
+
anchor_input_ids = tokenized_query['input_ids'].to(self.device)
|
53 |
+
anchor_attention_mask = tokenized_query['attention_mask'].to(self.device)
|
54 |
+
anchor_outputs = self.model(anchor_input_ids, anchor_attention_mask)
|
55 |
+
# Get CLS token embedding
|
56 |
+
anchor_embedding = anchor_outputs.last_hidden_state[:, 0, :]
|
57 |
+
# Move to CPU and convert to numpy
|
58 |
+
query_embedding_numpy = anchor_embedding.cpu().numpy().flatten()
|
59 |
+
|
60 |
+
return query_embedding_numpy
|
61 |
+
|
62 |
+
def calculate_similarities(self, query_embedding):
|
63 |
+
similarities = []
|
64 |
+
|
65 |
+
# Calculate cosine similarity for each recipe
|
66 |
+
for i in range(len(self.recipe_embeddings)):
|
67 |
+
recipe_embedding = self.recipe_embeddings[i]
|
68 |
+
|
69 |
+
# Calculate cosine similarity
|
70 |
+
#Cosine Similarity = (a · b) / (||a|| * ||b||)
|
71 |
+
dot_product = np.dot(recipe_embedding, query_embedding)
|
72 |
+
recipe_norm = np.linalg.norm(recipe_embedding)
|
73 |
+
query_norm = np.linalg.norm(query_embedding)
|
74 |
+
|
75 |
+
# Avoid division by zero
|
76 |
+
if recipe_norm > 0 and query_norm > 0:
|
77 |
+
similarity = dot_product / (recipe_norm * query_norm)
|
78 |
+
else:
|
79 |
+
similarity = 0.0
|
80 |
+
|
81 |
+
similarities.append(similarity)
|
82 |
+
|
83 |
+
return similarities
|
84 |
+
|
85 |
+
def filter_recipes_by_quality(self, min_rating=3.0, min_num_ratings=5):
|
86 |
+
#Get the indices of recipes that meet the quality criteria the user chose
|
87 |
+
filtered_recipe_indices = []
|
88 |
+
|
89 |
+
for i in range(len(self.recipes_df)):
|
90 |
+
recipe = self.recipes_df.iloc[i]
|
91 |
+
recipe_id = recipe['id']
|
92 |
+
|
93 |
+
if recipe_id in self.recipe_stats:
|
94 |
+
avg_rating, num_ratings, _ = self.recipe_stats[recipe_id]
|
95 |
+
|
96 |
+
if avg_rating >= min_rating and num_ratings >= min_num_ratings:
|
97 |
+
filtered_recipe_indices.append(i)
|
98 |
+
|
99 |
+
return filtered_recipe_indices
|
100 |
+
|
101 |
+
def rank_recipes_by_similarity_and_rating(self, similarities, recipe_indices):
|
102 |
+
recipe_scores = []
|
103 |
+
|
104 |
+
for recipe_index in recipe_indices:
|
105 |
+
recipe = self.recipes_df.iloc[recipe_index]
|
106 |
+
recipe_id = recipe['id']
|
107 |
+
|
108 |
+
semantic_score = similarities[recipe_index]
|
109 |
+
|
110 |
+
#if the recipe has no ratings, assume it is a poor choice and default the rating to 1.0
|
111 |
+
if recipe_id in self.recipe_stats:
|
112 |
+
avg_rating, _, _ = self.recipe_stats[recipe_id]
|
113 |
+
else:
|
114 |
+
avg_rating = 1.0
|
115 |
+
|
116 |
+
recipe_scores.append({
|
117 |
+
'recipe_index': recipe_index,
|
118 |
+
'recipe_id': recipe_id,
|
119 |
+
'semantic_score': semantic_score,
|
120 |
+
'avg_rating': avg_rating
|
121 |
+
})
|
122 |
+
|
123 |
+
return recipe_scores
|
124 |
+
|
125 |
+
def create_recipe_result(self, recipe_index, scores_info):
|
126 |
+
recipe = self.recipes_df.iloc[recipe_index]
|
127 |
+
recipe_id = recipe['id']
|
128 |
+
|
129 |
+
|
130 |
+
avg_rating, num_ratings, unique_users = self.recipe_stats[recipe_id]
|
131 |
+
|
132 |
+
|
133 |
+
# Create result structure mapping
|
134 |
+
result = {
|
135 |
+
'recipe_id': int(recipe_id),
|
136 |
+
'name': recipe['name'],
|
137 |
+
'ingredients': recipe['ingredients'],
|
138 |
+
'tags': recipe['tags'],
|
139 |
+
'minutes': int(recipe['minutes']),
|
140 |
+
'n_steps': int(recipe['n_steps']),
|
141 |
+
'description': recipe.get('description', ''),
|
142 |
+
'semantic_score': float(scores_info['semantic_score']),
|
143 |
+
'avg_rating': float(avg_rating),
|
144 |
+
'num_ratings': int(num_ratings),
|
145 |
+
'unique_users': int(unique_users)
|
146 |
+
}
|
147 |
+
|
148 |
+
result = json.dumps(result)
|
149 |
+
return result
|
150 |
+
|
151 |
+
def search_recipes(self, user_query, top_k=5, min_rating=3.0, min_num_ratings=5):
|
152 |
+
|
153 |
+
# Create embedding for user query
|
154 |
+
query_embedding = self.create_query_embedding(user_query)
|
155 |
+
|
156 |
+
# Calculate similarities between query and all recipes
|
157 |
+
similarities = self.calculate_similarities(query_embedding)
|
158 |
+
|
159 |
+
# Filter recipes by quality
|
160 |
+
filtered_recipe_indices = self.filter_recipes_by_quality(min_rating, min_num_ratings)
|
161 |
+
|
162 |
+
# Rank by semantic similarity and rating
|
163 |
+
recipe_scores = self.rank_recipes_by_similarity_and_rating(similarities, filtered_recipe_indices)
|
164 |
+
|
165 |
+
# Sort by semantic similarity, then by average rating
|
166 |
+
recipe_scores.sort(key=lambda x: (x['semantic_score'], x['avg_rating']), reverse=True)
|
167 |
+
|
168 |
+
# Get top results
|
169 |
+
top_results = recipe_scores[:top_k]
|
170 |
+
|
171 |
+
# Create result dictionaries
|
172 |
+
final_results = []
|
173 |
+
for score_info in top_results:
|
174 |
+
recipe_result = self.create_recipe_result(score_info['recipe_index'], score_info)
|
175 |
+
final_results.append(recipe_result)
|
176 |
+
|
177 |
+
return final_results
|
178 |
+
|
179 |
+
|
180 |
+
def search_for_recipes(user_query, top_k=5, min_rating=3.0, min_num_ratings=5):
|
181 |
+
search_system = RecipeSearchSystem()
|
182 |
+
results = search_system.search_recipes(
|
183 |
+
user_query=user_query,
|
184 |
+
top_k=top_k,
|
185 |
+
min_rating=min_rating,
|
186 |
+
min_num_ratings=min_num_ratings
|
187 |
+
)
|
188 |
+
|
189 |
+
return results
|
190 |
+
|
191 |
+
|
192 |
+
if __name__ == "__main__":
|
193 |
+
|
194 |
+
search_system = RecipeSearchSystem()
|
195 |
+
test_queries = [
|
196 |
+
# "chicken pasta italian quick dinner",
|
197 |
+
# "chocolate cake dessert brownie baked healthy",
|
198 |
+
# "healthy vegetarian salad tomato basil",
|
199 |
+
# "quick easy dinner",
|
200 |
+
# "beef steak",
|
201 |
+
"beef pasta",
|
202 |
+
"beef"
|
203 |
+
]
|
204 |
+
|
205 |
+
for query in test_queries:
|
206 |
+
print(f"Testing query: '{query}'")
|
207 |
+
|
208 |
+
results = search_system.search_recipes(
|
209 |
+
user_query=query,
|
210 |
+
top_k=3,
|
211 |
+
min_rating=3.5,
|
212 |
+
min_num_ratings=10
|
213 |
+
)
|
214 |
+
|
215 |
+
print(results)
|
216 |
+
print("Recipe search system testing complete!")
|
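A side note on calculate_similarities above: the per-recipe Python loop can be replaced by an equivalent vectorized NumPy version; a minimal sketch (same cosine formula, assuming recipe_embeddings is an (N, D) array and query_embedding a (D,) vector):

import numpy as np

def cosine_similarities(recipe_embeddings, query_embedding):
    # Dot product of every recipe embedding (one row each) with the query vector
    dots = recipe_embeddings @ query_embedding
    # Product of the row norms and the query norm
    norms = np.linalg.norm(recipe_embeddings, axis=1) * np.linalg.norm(query_embedding)
    sims = np.zeros(len(recipe_embeddings))
    nonzero = norms > 0  # guard against zero-norm embeddings, like the loop does
    sims[nonzero] = dots[nonzero] / norms[nonzero]
    return sims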
utils/.DS_Store
ADDED
Binary file (6.15 kB).
|
|
utils/__init__.py
ADDED
File without changes
|
utils/layout.py
ADDED
@@ -0,0 +1,33 @@
1 |
+
# layout.py
|
2 |
+
import streamlit as st
|
3 |
+
|
4 |
+
def set_custom_page_config():
|
5 |
+
st.set_page_config(
|
6 |
+
page_title="Smart Kitchen Assistant",
|
7 |
+
layout="wide",
|
8 |
+
initial_sidebar_state="expanded"
|
9 |
+
)
|
10 |
+
|
11 |
+
def render_header():
|
12 |
+
st.markdown("""
|
13 |
+
<div class="project-header">
|
14 |
+
<h1>Smart Kitchen Assistant</h1>
|
15 |
+
<p>CSE555 Final Project — Group 5: Saksham & Ahmed</p>
|
16 |
+
</div>
|
17 |
+
""", unsafe_allow_html=True)
|
18 |
+
|
19 |
+
def render_footer():
|
20 |
+
st.markdown("""
|
21 |
+
<div class="footer">
|
22 |
+
<p>Made with ❤️ by Saksham & Ahmed | CSE555 @ UB</p>
|
23 |
+
</div>
|
24 |
+
""", unsafe_allow_html=True)
|
25 |
+
|
26 |
+
def render_layout(content_function):
|
27 |
+
set_custom_page_config()
|
28 |
+
with open("assets/css/styles.css") as f:
|
29 |
+
st.markdown(f"<style>{f.read()}</style>", unsafe_allow_html=True)
|
30 |
+
|
31 |
+
render_header()
|
32 |
+
content_function()
|
33 |
+
render_footer()
|
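For context, a hypothetical page module (not part of this commit) could hand its content to render_layout like this:

import streamlit as st
from utils.layout import render_layout

def page_content():
    st.write("Page body goes here")

render_layout(page_content)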