Spaces:
Sleeping
Sleeping
azaher1215
commited on
Commit
·
05b9293
1
Parent(s):
a306fec
final report additions
Browse files- Home.py +8 -10
- model/search_script.py +1 -1
- pages/3_Recipe_Recommendation.py +18 -16
- pages/4_Report.py +103 -83
- utils/layout.py +4 -4
Home.py
CHANGED
@@ -9,16 +9,14 @@ render_header()
|
|
9 |
|
10 |
st.markdown("""
|
11 |
<div class="about-box">
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
- Detecting their variations (cut, whole, sliced).
|
17 |
-
- Recommending recipes based on natural language input.
|
18 |
</div>
|
19 |
|
20 |
-
###
|
21 |
-
-
|
22 |
-
-
|
23 |
-
-
|
24 |
""", unsafe_allow_html=True)
|
|
|
9 |
|
10 |
st.markdown("""
|
11 |
<div class="about-box">
|
12 |
+
This tool leverages AI to assist in:<br>
|
13 |
+
- Classifying images of vegetables and fruits.<br>
|
14 |
+
- Detecting their variations (cut, whole, sliced).<br>
|
15 |
+
- Recommending recipes based on natural language input.<br>
|
|
|
|
|
16 |
</div>
|
17 |
|
18 |
+
### Use the left sidebar to navigate between:
|
19 |
+
- Task A: Classification - upload an image of a vegetable or fruit to classify it.
|
20 |
+
- Task B: Variation Detection - upload an image of a vegetable or fruit to detect its variation.
|
21 |
+
- NLP Recipe Recommendation - enter a search query to recommend a recipe.
|
22 |
""", unsafe_allow_html=True)
|
model/search_script.py
CHANGED
@@ -228,7 +228,7 @@ if __name__ == "__main__":
|
|
228 |
# "chocolate cake dessert brownie baked healthy",
|
229 |
# "healthy vegetarian salad tomato basil",
|
230 |
# "quick easy dinner",
|
231 |
-
|
232 |
"beef pasta",
|
233 |
"beef"
|
234 |
]
|
|
|
228 |
# "chocolate cake dessert brownie baked healthy",
|
229 |
# "healthy vegetarian salad tomato basil",
|
230 |
# "quick easy dinner",
|
231 |
+
"beef steak",
|
232 |
"beef pasta",
|
233 |
"beef"
|
234 |
]
|
pages/3_Recipe_Recommendation.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1 |
from utils.layout import render_layout
|
2 |
import streamlit as st
|
3 |
import time
|
4 |
-
from model.search_script import search_for_recipes
|
5 |
import streamlit.components.v1 as components
|
6 |
|
7 |
def recipe_search_page():
|
@@ -9,7 +9,7 @@ def recipe_search_page():
|
|
9 |
## Advanced Recipe Recommendation
|
10 |
<div class="about-box">
|
11 |
This module uses a custom-trained BERT model to semantically search recipes
|
12 |
-
based on your query
|
13 |
</div>
|
14 |
""", unsafe_allow_html=True)
|
15 |
|
@@ -25,7 +25,7 @@ def recipe_search_page():
|
|
25 |
|
26 |
query = st.text_input(
|
27 |
"Search for recipes:",
|
28 |
-
placeholder="e.g., 'chicken pasta', 'vegetarian salad', 'chocolate dessert', 'quick easy "
|
29 |
)
|
30 |
|
31 |
col1, col2 = st.columns(2)
|
@@ -50,36 +50,38 @@ def recipe_search_page():
|
|
50 |
description = recipe.get("description", "").strip().capitalize()
|
51 |
|
52 |
html_code = f"""
|
53 |
-
<div style
|
54 |
-
<div style
|
55 |
|
56 |
-
<div style
|
57 |
<b>{recipe['minutes']} min</b> | <b>{recipe['n_steps']} steps</b> | <b>{recipe['avg_rating']:.1f}/5.0</b>
|
58 |
-
<span style
|
59 |
</div>
|
60 |
|
61 |
-
<div style
|
62 |
-
<b>Match Score:</b> <span style
|
63 |
-
<span style
|
64 |
</div>
|
65 |
|
66 |
-
<div style
|
67 |
<b>Tags:</b><br>
|
68 |
-
|
|
|
|
|
69 |
</div>
|
70 |
|
71 |
-
<div style
|
72 |
<b>Ingredients:</b><br>
|
73 |
-
<span style
|
74 |
{'...' if len(recipe['ingredients']) > 8 else ''}</span>
|
75 |
</div>
|
76 |
|
77 |
-
{"<div style='margin-top: 10px; font-size: 13px; color: #333;'><b>Description:</b><br>" + description + "</div>" if description else ""}
|
78 |
|
79 |
{"<div style='margin-top: 10px; font-size: 13px;'><b>Steps:</b><ol style='margin: 6px 0 0 18px; padding: 0;'>" + steps_html + "</ol></div>" if steps_html else ""}
|
80 |
</div>
|
81 |
"""
|
82 |
-
components.html(html_code, height=
|
83 |
|
84 |
else:
|
85 |
st.warning(f"No recipes found for '{query}' with a minimum rating of {min_rating}/5.0.")
|
|
|
1 |
from utils.layout import render_layout
|
2 |
import streamlit as st
|
3 |
import time
|
4 |
+
from model.search_script import search_for_recipes
|
5 |
import streamlit.components.v1 as components
|
6 |
|
7 |
def recipe_search_page():
|
|
|
9 |
## Advanced Recipe Recommendation
|
10 |
<div class="about-box">
|
11 |
This module uses a custom-trained BERT model to semantically search recipes
|
12 |
+
based on your query of ingredients and tags.
|
13 |
</div>
|
14 |
""", unsafe_allow_html=True)
|
15 |
|
|
|
25 |
|
26 |
query = st.text_input(
|
27 |
"Search for recipes:",
|
28 |
+
placeholder="e.g., 'chicken pasta italian', 'vegetarian salad', 'chocolate dessert', 'quick easy' "
|
29 |
)
|
30 |
|
31 |
col1, col2 = st.columns(2)
|
|
|
50 |
description = recipe.get("description", "").strip().capitalize()
|
51 |
|
52 |
html_code = f"""
|
53 |
+
<div style=\"margin: 8px 0 8px 0; padding: 8px; border-radius: 12px; background-color: #fdfdfd; box-shadow: 0 2px 8px rgba(0,0,0,0.06); font-family: Arial, sans-serif; border: 1px solid #e0e0e0;\">
|
54 |
+
<div style=\"font-size: 18px; font-weight: bold; color: #333; margin-bottom: 8px;\"> {i}. {recipe['name']}</div>
|
55 |
|
56 |
+
<div style=\"margin: 4px 0 12px 0; font-size: 14px; color: #555;\">
|
57 |
<b>{recipe['minutes']} min</b> | <b>{recipe['n_steps']} steps</b> | <b>{recipe['avg_rating']:.1f}/5.0</b>
|
58 |
+
<span style=\"font-size: 12px; color: #999;\">({recipe['num_ratings']} ratings)</span>
|
59 |
</div>
|
60 |
|
61 |
+
<div style=\"margin-bottom: 8px; font-size: 14px;\">
|
62 |
+
<b>Match Score:</b> <span style=\"color: #007acc; font-weight: bold;\">{recipe['similarity_score']:.1%}</span>
|
63 |
+
<span style=\"font-size: 12px; color: #888;\">(query match)</span>
|
64 |
</div>
|
65 |
|
66 |
+
<div style=\"margin-bottom: 8px;\">
|
67 |
<b>Tags:</b><br>
|
68 |
+
<div style=\"margin-top: 8px;\">
|
69 |
+
{" ".join([f"<span style='background:#eee;padding:4px 8px;border-radius:6px;margin:2px;display:inline-block;font-size:12px'>{tag}</span>" for tag in recipe['tags']])}
|
70 |
+
</div>
|
71 |
</div>
|
72 |
|
73 |
+
<div style=\"margin-bottom: 8px;\">
|
74 |
<b>Ingredients:</b><br>
|
75 |
+
<span style=\"font-size: 13px; color: #444; margin-top: 4px; display: block;\">{', '.join(recipe['ingredients'][:8])}
|
76 |
{'...' if len(recipe['ingredients']) > 8 else ''}</span>
|
77 |
</div>
|
78 |
|
79 |
+
{"<div style='margin-top: 10px; font-size: 13px; color: #333;'><b>Description:</b><br><span style='margin-top: 4px; display: block;'>" + description + "</span></div>" if description else ""}
|
80 |
|
81 |
{"<div style='margin-top: 10px; font-size: 13px;'><b>Steps:</b><ol style='margin: 6px 0 0 18px; padding: 0;'>" + steps_html + "</ol></div>" if steps_html else ""}
|
82 |
</div>
|
83 |
"""
|
84 |
+
components.html(html_code, height=340, scrolling=True)
|
85 |
|
86 |
else:
|
87 |
st.warning(f"No recipes found for '{query}' with a minimum rating of {min_rating}/5.0.")
|
pages/4_Report.py
CHANGED
@@ -1,90 +1,124 @@
|
|
1 |
import streamlit as st
|
|
|
2 |
|
3 |
def render_report():
|
4 |
-
st.title("
|
5 |
|
6 |
# Title Page Information
|
7 |
st.markdown("""
|
8 |
-
**Course:** CSE 555 — Introduction to Pattern Recognition
|
9 |
**Authors:** Saksham Lakhera and Ahmed Zaher
|
10 |
**Date:** July 2025
|
11 |
""")
|
12 |
|
13 |
# Abstract
|
14 |
-
st.
|
15 |
|
16 |
-
st.subheader("NLP Engineering Perspective")
|
17 |
st.markdown("""
|
18 |
-
|
19 |
-
advanced semantic search capabilities using transformer-based language models. Traditional
|
20 |
-
keyword-based search methods often fail to capture the nuanced relationships between
|
21 |
-
ingredients, cooking techniques, and user preferences in culinary contexts.
|
22 |
|
|
|
|
|
|
|
|
|
23 |
Our approach leverages BERT (Bidirectional Encoder Representations from Transformers)
|
24 |
fine-tuning on a custom recipe dataset to develop a semantic understanding of culinary content.
|
25 |
We preprocessed and structured a subset of 15,000 recipes into standardized sequences organized
|
26 |
by food categories (proteins, vegetables, legumes, etc.) to create training data optimized for
|
27 |
the BERT architecture.
|
28 |
-
|
29 |
The model was fine-tuned to learn contextual embeddings that capture semantic relationships
|
30 |
-
between ingredients and tags. At
|
31 |
dataset and perform cosine-similarity retrieval to produce the top-K most relevant recipes
|
32 |
for a user query.
|
33 |
""")
|
34 |
|
35 |
# Introduction
|
36 |
-
st.
|
37 |
st.markdown("""
|
38 |
-
This term project serves primarily as an educational exercise aimed at giving
|
39 |
end-to-end exposure to building a modern NLP system. Our goal is to construct a semantic
|
40 |
recipe-search engine that demonstrates how domain-specific fine-tuning of BERT can
|
41 |
substantially improve retrieval quality over simple keyword matching.
|
42 |
|
43 |
**Key Contributions:**
|
44 |
- A cleaned, category-labelled recipe subset of 15,000 recipes
|
45 |
-
- Training scripts that yield
|
46 |
- A production-ready retrieval service that returns top-K most relevant recipes
|
47 |
- Comparative evaluation against classical baselines
|
48 |
""")
|
49 |
|
50 |
# Dataset and Preprocessing
|
51 |
-
st.
|
52 |
|
53 |
-
st.subheader("Data Sources")
|
54 |
st.markdown("""
|
|
|
|
|
55 |
The project draws from two CSV files:
|
56 |
-
- **Raw_recipes.csv
|
57 |
-
- **Raw_interactions.csv
|
58 |
""")
|
59 |
|
60 |
-
st.subheader("Corpus Filtering and Subset Selection")
|
61 |
st.markdown("""
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
66 |
""")
|
67 |
|
68 |
-
st.subheader("Text Pre-processing Pipeline")
|
69 |
st.markdown("""
|
70 |
-
|
71 |
-
|
72 |
-
- **
|
73 |
-
- **
|
74 |
-
- **
|
|
|
|
|
75 |
""")
|
76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
# Methodology
|
78 |
-
st.
|
79 |
|
80 |
-
st.subheader("Model Architecture")
|
81 |
st.markdown("""
|
82 |
-
|
83 |
-
|
84 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
""")
|
86 |
|
87 |
-
st.
|
88 |
col1, col2 = st.columns(2)
|
89 |
with col1:
|
90 |
st.markdown("""
|
@@ -98,108 +132,94 @@ def render_report():
|
|
98 |
- **Optimizer:** AdamW
|
99 |
- **Epochs:** 3
|
100 |
- **Hardware:** Google Colab A100 GPU (40 GB VRAM)
|
101 |
-
- **Training time:** ~
|
102 |
""")
|
103 |
|
104 |
# Mathematical Formulations
|
105 |
-
st.
|
106 |
|
107 |
-
st.
|
|
|
|
|
|
|
108 |
st.latex(r"""
|
109 |
\text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
|
110 |
""")
|
111 |
st.markdown("Where $\\hat{q}$ is the BERT embedding of the query, and $\\hat{r}_i$ is the embedding of the i-th recipe.")
|
112 |
|
113 |
-
st.subheader("Final Score Calculation")
|
114 |
-
st.latex(r"""
|
115 |
-
\text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
|
116 |
-
""")
|
117 |
|
118 |
# Results
|
119 |
-
st.
|
120 |
|
121 |
-
st.
|
122 |
results_data = {
|
123 |
"Run": [1, 2, 3, 4],
|
124 |
"Configuration": [
|
125 |
"Raw, no cleaning/ordering",
|
126 |
"Cleaned text, unordered",
|
127 |
-
"Cleaned text + dropout",
|
128 |
-
"Cleaned text +
|
129 |
],
|
130 |
"Epoch-3 Train Loss": [0.0065, 0.0023, 0.0061, 0.0119],
|
131 |
"Validation Loss": [0.1100, 0.0000, 0.0118, 0.0067]
|
132 |
}
|
133 |
st.table(results_data)
|
134 |
-
|
135 |
st.markdown("""
|
136 |
-
**Key Finding:** Run 4 (cleaned text +
|
137 |
between low validation loss and meaningful retrieval quality.
|
138 |
""")
|
139 |
|
140 |
-
st.
|
141 |
st.markdown("""
|
|
|
142 |
**Query: "beef steak dinner"**
|
143 |
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
|
|
|
|
144 |
- Run 4 (Final): *grilled garlic steak dinner*, *classic beef steak au poivre*
|
145 |
|
146 |
**Query: "chicken italian pasta"**
|
147 |
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
|
|
|
|
148 |
- Run 4 (Final): *creamy tuscan chicken pasta*, *italian chicken penne bake*
|
149 |
|
150 |
**Query: "vegetarian salad healthy"**
|
151 |
-
- Run 1 (Raw):
|
|
|
|
|
152 |
- Run 4 (Final): *kale quinoa power salad*, *superfood spinach & berry salad*
|
153 |
""")
|
154 |
|
155 |
# Discussion and Conclusion
|
156 |
-
st.
|
157 |
st.markdown("""
|
158 |
The experimental evidence underscores the importance of disciplined pre-processing when
|
159 |
-
adapting large language models to niche domains. The breakthrough came with
|
160 |
-
(protein → vegetables → grains → dairy → other) which supplied consistent positional signals.
|
|
|
|
|
161 |
|
162 |
**Key Achievements:**
|
163 |
- End-to-end recipe recommendation system with semantic search
|
164 |
-
- Sub-second latency across 231k recipes
|
165 |
- Meaningful semantic understanding of culinary content
|
166 |
- Reproducible blueprint for domain-specific NLP applications
|
167 |
|
168 |
**Limitations:**
|
169 |
-
- Private dataset relatively small (
|
|
|
170 |
- Minimal hyperparameter search conducted
|
171 |
- Single-machine deployment tested
|
|
|
172 |
""")
|
173 |
|
174 |
-
# Technical Specifications
|
175 |
-
st.header("Technical Specifications")
|
176 |
-
col1, col2 = st.columns(2)
|
177 |
-
with col1:
|
178 |
-
st.markdown("""
|
179 |
-
**Dataset:**
|
180 |
-
- Total Recipes: 231,630
|
181 |
-
- Training Set: 15,000 recipes
|
182 |
-
- Average Tags per Recipe: ~6
|
183 |
-
- Ingredients per Recipe: 3-20
|
184 |
-
""")
|
185 |
-
with col2:
|
186 |
-
st.markdown("""
|
187 |
-
**Infrastructure:**
|
188 |
-
- Python 3.10
|
189 |
-
- PyTorch 2.1 (CUDA 11.8)
|
190 |
-
- Transformers 4.38
|
191 |
-
- Google Colab A100 GPU
|
192 |
-
""")
|
193 |
-
|
194 |
# References
|
195 |
-
st.
|
196 |
st.markdown("""
|
197 |
-
[1] Vaswani et al., "Attention Is All You Need," NeurIPS, 2017.
|
198 |
-
|
199 |
-
[
|
200 |
-
|
201 |
-
[3] Reimers and Gurevych, "Sentence-BERT: Sentence Embeddings Using Siamese BERT-Networks," EMNLP-IJCNLP, 2019.
|
202 |
-
|
203 |
[4] Hugging Face, "BERT Model Documentation," 2024.
|
204 |
""")
|
205 |
|
@@ -207,5 +227,5 @@ def render_report():
|
|
207 |
st.markdown("© 2025 CSE 555 Term Project. All rights reserved.")
|
208 |
|
209 |
# Render the report
|
210 |
-
render_report
|
211 |
|
|
|
1 |
import streamlit as st
|
2 |
+
from utils.layout import render_layout
|
3 |
|
4 |
def render_report():
|
5 |
+
st.title("Image Classification CV and Fine-Tuned NLP Recipe Recommendation")
|
6 |
|
7 |
# Title Page Information
|
8 |
st.markdown("""
|
|
|
9 |
**Authors:** Saksham Lakhera and Ahmed Zaher
|
10 |
**Date:** July 2025
|
11 |
""")
|
12 |
|
13 |
# Abstract
|
14 |
+
st.subheader("Abstract")
|
15 |
|
|
|
16 |
st.markdown("""
|
17 |
+
**NLP Engineering Perspective:**
|
|
|
|
|
|
|
18 |
|
19 |
+
This project addresses the challenge of improving recipe recommendation systems through
|
20 |
+
advanced semantic search capabilities using transformer-based language models. This will explain how to fine-tune a model
|
21 |
+
to learn domain-specific context to capture the nuanced relationships between
|
22 |
+
ingredients and cooking techniques in culinary contexts.
|
23 |
Our approach leverages BERT (Bidirectional Encoder Representations from Transformers)
|
24 |
fine-tuning on a custom recipe dataset to develop a semantic understanding of culinary content.
|
25 |
We preprocessed and structured a subset of 15,000 recipes into standardized sequences organized
|
26 |
by food categories (proteins, vegetables, legumes, etc.) to create training data optimized for
|
27 |
the BERT architecture.
|
|
|
28 |
The model was fine-tuned to learn contextual embeddings that capture semantic relationships
|
29 |
+
between ingredients and tags. At the end, we generate embeddings for all recipes in our
|
30 |
dataset and perform cosine-similarity retrieval to produce the top-K most relevant recipes
|
31 |
for a user query.
|
32 |
""")
|
33 |
|
34 |
# Introduction
|
35 |
+
st.subheader("Introduction")
|
36 |
st.markdown("""
|
37 |
+
This term project serves primarily as an educational exercise aimed at giving
|
38 |
end-to-end exposure to building a modern NLP system. Our goal is to construct a semantic
|
39 |
recipe-search engine that demonstrates how domain-specific fine-tuning of BERT can
|
40 |
substantially improve retrieval quality over simple keyword matching.
|
41 |
|
42 |
**Key Contributions:**
|
43 |
- A cleaned, category-labelled recipe subset of 15,000 recipes
|
44 |
+
- Training scripts that yield adapted contextual embeddings
|
45 |
- A production-ready retrieval service that returns top-K most relevant recipes
|
46 |
- Comparative evaluation against classical baselines
|
47 |
""")
|
48 |
|
49 |
# Dataset and Preprocessing
|
50 |
+
st.subheader("Dataset and Pre-processing")
|
51 |
|
|
|
52 |
st.markdown("""
|
53 |
+
**Data Sources:**
|
54 |
+
|
55 |
The project draws from two CSV files:
|
56 |
+
- **Raw_recipes.csv:** 231,637 rows, one per recipe with columns: *id, name, ingredients, tags, minutes, steps, description, n_steps, n_ingredients*
|
57 |
+
- **Raw_interactions.csv:** user feedback containing *recipe_id, user_id, rating, review text*
|
58 |
""")
|
59 |
|
|
|
60 |
st.markdown("""
|
61 |
+
**Corpus Filtering and Subset Selection**
|
62 |
+
|
63 |
+
- **Invalid rows removed:** recipes with empty ingredient lists, missing tags, or fewer than three total tags
|
64 |
+
- **Random sampling:** 15,000 recipes selected for NLP fine-tuning
|
65 |
+
- **Positive/negative pairs:** generated for contrastive learning using ratings and tag similarity
|
66 |
+
- **Train/test split:** 80/20 stratified split (12,000/3,000 pairs)
|
67 |
""")
|
68 |
|
|
|
69 |
st.markdown("""
|
70 |
+
**Text Pre-processing Pipeline**
|
71 |
+
|
72 |
+
- **Lower-casing & punctuation removal:** normalized to lowercase, special characters stripped
|
73 |
+
- **Stop-descriptor removal:** culinary modifiers (*fresh, chopped, minced*) and measurements (tablespoons, teaspoons, cups, etc.) removed
|
74 |
+
- **Ingredient ordering:** re-ordered into sequence: protein → vegetables/grains/dairy → other
|
75 |
+
- **Tag normalization:** mapped to 7 main categories: *cuisine, course, main-ingredient, dietary, difficulty, occasion, cooking_method*
|
76 |
+
- **Tokenization:** standard *bert-base-uncased* WordPiece tokenizer, sequences truncated/padded to 128 tokens
|
77 |
""")
|
78 |
+
# Technical Specifications
|
79 |
+
st.subheader("Technical Specifications")
|
80 |
+
col1, col2 = st.columns(2)
|
81 |
+
with col1:
|
82 |
+
st.markdown("""
|
83 |
+
**Dataset:**
|
84 |
+
- Total Recipes: 231,630
|
85 |
+
- Training Set: 12,000 recipes
|
86 |
+
- Average Tags per Recipe: ~6
|
87 |
+
- Ingredients per Recipe: 3-20
|
88 |
+
""")
|
89 |
+
with col2:
|
90 |
+
st.markdown("""
|
91 |
+
**Infrastructure:**
|
92 |
+
- Python 3.10
|
93 |
+
- PyTorch 2.1 (CUDA 11.8)
|
94 |
+
- Transformers 4.38
|
95 |
+
- Google Colab A100 GPU
|
96 |
+
""")
|
97 |
# Methodology
|
98 |
+
st.subheader("Methodology")
|
99 |
|
|
|
100 |
st.markdown("""
|
101 |
+
**Model Architecture**
|
102 |
+
|
103 |
+
- **Base Model:** bert-base-uncased
|
104 |
+
- **Additional Layers:** In some runs, we added a single linear classification layer with dropout (p = 0.1)
|
105 |
+
- **Training Objective:** Triplet-margin loss with margin of 1.0
|
106 |
+
|
107 |
+
We first trained the model directly on the raw data to see whether we would get any good results. As seen in Table 1, this run resulted in a very low training error
|
108 |
+
but when run on the validation set, the error was higher. We then cleaned up the data by removing any empty space, standardizing to lowercase text, and removing
|
109 |
+
all punctuation and retrained the model. This resulted in a highly overfitted model as seen in table 1 and the results section below. Next, we added a single linear layer on top of
|
110 |
+
the BERT's current architecture and added a dropout to get rid of overfitting. The results as shown in table 1 were better. Although the semantic
|
111 |
+
results were better than before, the model still was not good at identifying the relationships between ingredients and the different tags. We then further
|
112 |
+
structured the data by ordering the tags and ingredients in a structured manner across the dataset and retrained the model. This resulted in a better
|
113 |
+
training and validation loss. This is also evident in the semantic retrieval results below.
|
114 |
+
|
115 |
+
**Website Development:**
|
116 |
+
- We used Streamlit to develop the website. However, we faced a few issues with the size of the trained model, so we switched hosting to Hugging Face.
|
117 |
+
- The website loads the pre-trained model along with the recipe embeddings and the top-k retrieval function and waits for the user to enter a query.
|
118 |
+
- The query is then processed by the model and the top-k recipes are returned.
|
119 |
""")
|
120 |
|
121 |
+
st.markdown("**Hyperparameters and Training**")
|
122 |
col1, col2 = st.columns(2)
|
123 |
with col1:
|
124 |
st.markdown("""
|
|
|
132 |
- **Optimizer:** AdamW
|
133 |
- **Epochs:** 3
|
134 |
- **Hardware:** Google Colab A100 GPU (40 GB VRAM)
|
135 |
+
- **Training time:** ~30 minutes per run
|
136 |
""")
|
137 |
|
138 |
# Mathematical Formulations
|
139 |
+
st.subheader("Mathematical Formulations and Top-K Retrieval")
|
140 |
|
141 |
+
st.markdown("""**Query Embedding and Similarity Calculation**: we used the trained model weights to generate embeddings for the entire recipe corpus. We then used cosine similarity to calculate the similarity between the query and the recipe corpus.
|
142 |
+
Once the user query is passed, we embedded the query using the trained model and used the cosine similarity formula below to retrieve the top-K
|
143 |
+
recipes. We then kept only the ones that have an average rating >= 3.0 and at least 5 ratings, and sorted the recipes by similarity and then by average rating.
|
144 |
+
""")
|
145 |
st.latex(r"""
|
146 |
\text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
|
147 |
""")
|
148 |
st.markdown("Where $\\hat{q}$ is the BERT embedding of the query, and $\\hat{r}_i$ is the embedding of the i-th recipe.")
|
149 |
|
|
|
|
|
|
|
|
|
150 |
|
151 |
# Results
|
152 |
+
st.subheader("Results")
|
153 |
|
154 |
+
st.markdown("**Training and Validation Loss**")
|
155 |
results_data = {
|
156 |
"Run": [1, 2, 3, 4],
|
157 |
"Configuration": [
|
158 |
"Raw, no cleaning/ordering",
|
159 |
"Cleaned text, unordered",
|
160 |
+
"Cleaned text + single layer + dropout",
|
161 |
+
"Cleaned text + ordering"
|
162 |
],
|
163 |
"Epoch-3 Train Loss": [0.0065, 0.0023, 0.0061, 0.0119],
|
164 |
"Validation Loss": [0.1100, 0.0000, 0.0118, 0.0067]
|
165 |
}
|
166 |
st.table(results_data)
|
167 |
+
st.markdown("""Table 1: Training and Validation Loss for each run""")
|
168 |
st.markdown("""
|
169 |
+
**Key Finding:** Run 4 (cleaned text + ordering) achieved the best balance
|
170 |
between low validation loss and meaningful retrieval quality.
|
171 |
""")
|
172 |
|
173 |
+
st.markdown("**Qualitative Retrieval Examples**")
|
174 |
st.markdown("""
|
175 |
+
In this section, we will show how the results of the model differ between runs and how the model performs on different queries.
|
176 |
**Query: "beef steak dinner"**
|
177 |
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
178 |
+
- Run 2 (Cleaned text, unordered): *aussie pepper steak steak with creamy pepper sauce*
|
179 |
+
- Run 3 (Cleaned text + single layer + dropout): *balsamic rib eye steak with bleu cheese sauce*
|
180 |
- Run 4 (Final): *grilled garlic steak dinner*, *classic beef steak au poivre*
|
181 |
|
182 |
**Query: "chicken italian pasta"**
|
183 |
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
184 |
+
- Run 2 (Cleaned text, unordered): *baked chicken soup*
|
185 |
+
- Run 3 (Cleaned text + single layer + dropout): *absolute best ever lasagna*
|
186 |
- Run 4 (Final): *creamy tuscan chicken pasta*, *italian chicken penne bake*
|
187 |
|
188 |
**Query: "vegetarian salad healthy"**
|
189 |
+
- Run 1 (Raw): *to die for crock pot roast*
|
190 |
+
- Run 2 (Cleaned text, unordered): *avocado mandarin salad*
|
191 |
+
- Run 3 (Cleaned text + single layer + dropout): *black bean and sweet potato salad*
|
192 |
- Run 4 (Final): *kale quinoa power salad*, *superfood spinach & berry salad*
|
193 |
""")
|
194 |
|
195 |
# Discussion and Conclusion
|
196 |
+
st.subheader("Discussion and Conclusion")
|
197 |
st.markdown("""
|
198 |
The experimental evidence underscores the importance of disciplined pre-processing when
|
199 |
+
adapting large language models to niche domains. The breakthrough came with ingredient-ordering
|
200 |
+
(protein → vegetables → grains → dairy → other) which supplied consistent positional signals. As we can see in the results,
|
201 |
+
the performance of the model improves with the addition of the single layer and dropout but the results are still not as good as the final run where
|
202 |
+
we added the ordering of the ingredients.
|
203 |
|
204 |
**Key Achievements:**
|
205 |
- End-to-end recipe recommendation system with semantic search
|
|
|
206 |
- Meaningful semantic understanding of culinary content
|
207 |
- Reproducible blueprint for domain-specific NLP applications
|
208 |
|
209 |
**Limitations:**
|
210 |
+
- Relatively small private training set (12k samples) compared to public corpora
|
211 |
+
- Further pre-processing could be done to improve the results
|
212 |
- Minimal hyperparameter search conducted
|
213 |
- Single-machine deployment tested
|
214 |
+
- The model is not able to handle complex queries and it is not able to handle synonyms and antonyms.
|
215 |
""")
|
216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
# References
|
218 |
+
st.subheader("References")
|
219 |
st.markdown("""
|
220 |
+
[1] Vaswani et al., "Attention Is All You Need," NeurIPS, 2017.
|
221 |
+
[2] Devlin et al., "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," NAACL-HLT, 2019.
|
222 |
+
[3] Reimers and Gurevych, "Sentence-BERT: Sentence Embeddings Using Siamese BERT-Networks," EMNLP-IJCNLP, 2019.
|
|
|
|
|
|
|
223 |
[4] Hugging Face, "BERT Model Documentation," 2024.
|
224 |
""")
|
225 |
|
|
|
227 |
st.markdown("© 2025 CSE 555 Term Project. All rights reserved.")
|
228 |
|
229 |
# Render the report
|
230 |
+
render_layout(render_report)
|
231 |
|
utils/layout.py
CHANGED
@@ -3,7 +3,7 @@ import streamlit as st
|
|
3 |
|
4 |
def set_custom_page_config():
|
5 |
st.set_page_config(
|
6 |
-
page_title="
|
7 |
layout="wide",
|
8 |
initial_sidebar_state="expanded"
|
9 |
)
|
@@ -11,15 +11,15 @@ def set_custom_page_config():
|
|
11 |
def render_header():
|
12 |
st.markdown("""
|
13 |
<div class="project-header">
|
14 |
-
<h1>
|
15 |
-
<p>CSE555 Final Project — Group 5: Saksham & Ahmed</p>
|
16 |
</div>
|
17 |
""", unsafe_allow_html=True)
|
18 |
|
19 |
def render_footer():
|
20 |
st.markdown("""
|
21 |
<div class="footer">
|
22 |
-
<p>Made with ❤️ by Saksham &
|
23 |
</div>
|
24 |
""", unsafe_allow_html=True)
|
25 |
|
|
|
3 |
|
4 |
def set_custom_page_config():
|
5 |
st.set_page_config(
|
6 |
+
page_title="Computer Vision and Natural Language Processing Project",
|
7 |
layout="wide",
|
8 |
initial_sidebar_state="expanded"
|
9 |
)
|
|
|
11 |
def render_header():
|
12 |
st.markdown("""
|
13 |
<div class="project-header">
|
14 |
+
<h1>Computer Vision and Natural Language Processing Project</h1>
|
15 |
+
<p>CSE555 Final Project — Group 5: Saksham Lakhera & Ahmed Zaher</p>
|
16 |
</div>
|
17 |
""", unsafe_allow_html=True)
|
18 |
|
19 |
def render_footer():
|
20 |
st.markdown("""
|
21 |
<div class="footer">
|
22 |
+
<p>Made with ❤️ by Saksham & Zaher | CSE555 @ UB</p>
|
23 |
</div>
|
24 |
""", unsafe_allow_html=True)
|
25 |
|