Spaces:
Sleeping
Sleeping
azaher1215
committed on
Commit
·
b8416be
1
Parent(s):
1ba150b
integrating md file with report.py
Browse files- pages/4_Report.py +192 -88
pages/4_Report.py
CHANGED
@@ -1,107 +1,211 @@
|
|
1 |
import streamlit as st
|
2 |
|
3 |
def render_report():
|
4 |
-
st.title("
|
5 |
-
|
|
|
6 |
st.markdown("""
|
7 |
-
|
8 |
-
|
|
|
9 |
""")
|
10 |
-
|
11 |
-
|
12 |
-
st.
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
""")
|
|
|
|
|
|
|
15 |
st.markdown("""
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
""")
|
18 |
-
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
st.latex(r"""
|
21 |
\text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
|
22 |
""")
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
st.markdown("""
|
26 |
-
|
27 |
-
|
28 |
-
- **Ingredients per Recipe:** 3 to 20
|
29 |
-
- **Ratings Data:** Extracted from user interaction dataset
|
30 |
""")
|
31 |
-
|
32 |
-
st.
|
33 |
st.markdown("""
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
""")
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
st.markdown("---")
|
42 |
-
st.markdown("© 2025
|
43 |
|
44 |
-
#
|
45 |
render_report()
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
# LaTeX content as string
|
50 |
-
latex_report = r"""
|
51 |
-
\documentclass{article}
|
52 |
-
\usepackage{amsmath}
|
53 |
-
\usepackage{geometry}
|
54 |
-
\geometry{margin=1in}
|
55 |
-
\title{Recipe Recommendation System Report}
|
56 |
-
\author{Saksham Lakhera}
|
57 |
-
\date{\today}
|
58 |
-
|
59 |
-
\begin{document}
|
60 |
-
\maketitle
|
61 |
-
|
62 |
-
\section*{Overview}
|
63 |
-
This report summarizes the working of the \textbf{custom BERT-based Recipe Recommendation System}, dataset characteristics, scoring algorithm, and evaluation metrics.
|
64 |
-
|
65 |
-
\section*{Query Embedding and Similarity Calculation}
|
66 |
-
\[
|
67 |
-
\text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
|
68 |
-
\]
|
69 |
-
Here, $\hat{q}$ is the BERT embedding of the query, and $\hat{r}_i$ is the embedding of the i-th recipe.
|
70 |
-
|
71 |
-
\section*{Final Score Calculation}
|
72 |
-
\[
|
73 |
-
\text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
|
74 |
-
\]
|
75 |
-
|
76 |
-
\section*{Dataset Summary}
|
77 |
-
\begin{itemize}
|
78 |
-
\item \textbf{Total Recipes:} 231,630
|
79 |
-
\item \textbf{Average Tags per Recipe:} $\sim$6
|
80 |
-
\item \textbf{Ingredients per Recipe:} 3 to 20
|
81 |
-
\item \textbf{Ratings Source:} User interaction dataset
|
82 |
-
\end{itemize}
|
83 |
-
|
84 |
-
\section*{Evaluation Strategy}
|
85 |
-
We use a combination of:
|
86 |
-
\begin{itemize}
|
87 |
-
\item Manual inspection
|
88 |
-
\item Recipe diversity analysis
|
89 |
-
\item Match vs rating correlation
|
90 |
-
\item Qualitative user feedback
|
91 |
-
\end{itemize}
|
92 |
-
|
93 |
-
\end{document}
|
94 |
-
"""
|
95 |
-
|
96 |
-
# ⬇️ Download button to get the .tex file
|
97 |
-
st.markdown("### 📥 Download LaTeX Report")
|
98 |
-
st.download_button(
|
99 |
-
label="Download LaTeX (.tex)",
|
100 |
-
data=latex_report,
|
101 |
-
file_name="recipe_report.tex",
|
102 |
-
mime="text/plain"
|
103 |
-
)
|
104 |
-
|
105 |
-
# π€ Optional: Show the .tex content in the app
|
106 |
-
with st.expander("π View LaTeX (.tex) File Content"):
|
107 |
-
st.code(latex_report, language="latex")
|
|
|
1 |
import streamlit as st
|
2 |
|
3 |
def render_report():
|
4 |
+
st.title("Group 5: Term Project Report")
|
5 |
+
|
6 |
+
# Title Page Information
|
7 |
st.markdown("""
|
8 |
+
**Course:** CSE 555 — Introduction to Pattern Recognition
|
9 |
+
**Authors:** Saksham Lakhera and Ahmed Zaher
|
10 |
+
**Date:** July 2025
|
11 |
""")
|
12 |
+
|
13 |
+
# Abstract
|
14 |
+
st.header("Abstract")
|
15 |
+
|
16 |
+
st.subheader("NLP Engineering Perspective")
|
17 |
+
st.markdown("""
|
18 |
+
This project addresses the challenge of improving recipe recommendation systems through
|
19 |
+
advanced semantic search capabilities using transformer-based language models. Traditional
|
20 |
+
keyword-based search methods often fail to capture the nuanced relationships between
|
21 |
+
ingredients, cooking techniques, and user preferences in culinary contexts.
|
22 |
+
|
23 |
+
Our approach leverages BERT (Bidirectional Encoder Representations from Transformers)
|
24 |
+
fine-tuning on a custom recipe dataset to develop a semantic understanding of culinary content.
|
25 |
+
We preprocessed and structured a subset of 15,000 recipes into standardized sequences organized
|
26 |
+
by food categories (proteins, vegetables, legumes, etc.) to create training data optimized for
|
27 |
+
the BERT architecture.
|
28 |
+
|
29 |
+
The model was fine-tuned to learn contextual embeddings that capture semantic relationships
|
30 |
+
between ingredients and tags. At inference time we generate embeddings for all recipes in our
|
31 |
+
dataset and perform cosine-similarity retrieval to produce the top-K most relevant recipes
|
32 |
+
for a user query.
|
33 |
""")
|
34 |
+
|
35 |
+
# Introduction
|
36 |
+
st.header("Introduction")
|
37 |
st.markdown("""
|
38 |
+
This term project serves primarily as an educational exercise aimed at giving students
|
39 |
+
end-to-end exposure to building a modern NLP system. Our goal is to construct a semantic
|
40 |
+
recipe-search engine that demonstrates how domain-specific fine-tuning of BERT can
|
41 |
+
substantially improve retrieval quality over simple keyword matching.
|
42 |
+
|
43 |
+
**Key Contributions:**
|
44 |
+
- A cleaned, category-labelled recipe subset of 15,000 recipes
|
45 |
+
- Training scripts that yield domain-adapted contextual embeddings
|
46 |
+
- A production-ready retrieval service that returns top-K most relevant recipes
|
47 |
+
- Comparative evaluation against classical baselines
|
48 |
""")
|
49 |
+
|
50 |
+
# Dataset and Preprocessing
|
51 |
+
st.header("Dataset and Pre-processing")
|
52 |
+
|
53 |
+
st.subheader("Data Sources")
|
54 |
+
st.markdown("""
|
55 |
+
The project draws from two CSV files:
|
56 |
+
- **Raw_recipes.csv** — 231,637 rows, one per recipe with columns: *id, name, ingredients, tags, minutes, steps, description, n_steps, n_ingredients*
|
57 |
+
- **Raw_interactions.csv** — user feedback containing *recipe_id, user_id, rating (1-5), review text*
|
58 |
+
""")
|
59 |
+
|
60 |
+
st.subheader("Corpus Filtering and Subset Selection")
|
61 |
+
st.markdown("""
|
62 |
+
1. **Invalid rows removed** — recipes with empty ingredient lists, missing tags, or fewer than three total tags
|
63 |
+
2. **Random sampling** — 15,000 recipes selected for NLP fine-tuning
|
64 |
+
3. **Positive/negative pairs** — generated for contrastive learning using ratings and tag similarity
|
65 |
+
4. **Train/test split** — 80/20 stratified split (12,000/3,000 pairs)
|
66 |
+
""")
|
67 |
+
|
68 |
+
st.subheader("Text Pre-processing Pipeline")
|
69 |
+
st.markdown("""
|
70 |
+
- **Lower-casing & punctuation removal** — normalized to lowercase, special characters stripped
|
71 |
+
- **Stop-descriptor removal** — culinary modifiers (*fresh, chopped, minced*) and measurements removed
|
72 |
+
- **Ingredient ordering** — re-ordered into sequence: **protein → vegetables → grains → dairy → other**
|
73 |
+
- **Tag normalization** — mapped to six canonical slots: *cuisine, course, main-ingredient, dietary, difficulty, occasion*
|
74 |
+
- **Tokenization** — standard *bert-base-uncased* WordPiece tokenizer, sequences truncated/padded to 128 tokens
|
75 |
+
""")
|
76 |
+
|
77 |
+
# Methodology
|
78 |
+
st.header("Methodology")
|
79 |
+
|
80 |
+
st.subheader("Model Architecture")
|
81 |
+
st.markdown("""
|
82 |
+
- **Base Model:** `bert-base-uncased` checkpoint
|
83 |
+
- **Additional Layers:** Single linear classification layer (768 → 1) with dropout (p = 0.1)
|
84 |
+
- **Training Objective:** Triplet-margin loss with margin of 1.0
|
85 |
+
""")
|
86 |
+
|
87 |
+
st.subheader("Hyperparameters")
|
88 |
+
col1, col2 = st.columns(2)
|
89 |
+
with col1:
|
90 |
+
st.markdown("""
|
91 |
+
- **Batch size:** 8
|
92 |
+
- **Max sequence length:** 128 tokens
|
93 |
+
- **Learning rate:** 2 × 10⁻⁵
|
94 |
+
- **Weight decay:** 0.01
|
95 |
+
""")
|
96 |
+
with col2:
|
97 |
+
st.markdown("""
|
98 |
+
- **Optimizer:** AdamW
|
99 |
+
- **Epochs:** 3
|
100 |
+
- **Hardware:** Google Colab A100 GPU (40 GB VRAM)
|
101 |
+
- **Training time:** ~75 minutes per run
|
102 |
+
""")
|
103 |
+
|
104 |
+
# Mathematical Formulations
|
105 |
+
st.header("Mathematical Formulations")
|
106 |
+
|
107 |
+
st.subheader("Query Embedding and Similarity Calculation")
|
108 |
+
st.latex(r"""
|
109 |
+
\text{Similarity}(q, r_i) = \cos(\hat{q}, \hat{r}_i) = \frac{\hat{q} \cdot \hat{r}_i}{\|\hat{q}\|\|\hat{r}_i\|}
|
110 |
+
""")
|
111 |
+
st.markdown("Where $\\hat{q}$ is the BERT embedding of the query, and $\\hat{r}_i$ is the embedding of the i-th recipe.")
|
112 |
+
|
113 |
+
st.subheader("Final Score Calculation")
|
114 |
st.latex(r"""
|
115 |
\text{Score}_i = 0.6 \times \text{Similarity}_i + 0.4 \times \text{Popularity}_i
|
116 |
""")
|
117 |
+
|
118 |
+
# Results
|
119 |
+
st.header("Results")
|
120 |
+
|
121 |
+
st.subheader("Training and Validation Loss")
|
122 |
+
results_data = {
|
123 |
+
"Run": [1, 2, 3, 4],
|
124 |
+
"Configuration": [
|
125 |
+
"Raw, no cleaning/ordering",
|
126 |
+
"Cleaned text, unordered",
|
127 |
+
"Cleaned text + dropout",
|
128 |
+
"Cleaned text + dropout + ordering"
|
129 |
+
],
|
130 |
+
"Epoch-3 Train Loss": [0.0065, 0.0023, 0.0061, 0.0119],
|
131 |
+
"Validation Loss": [0.1100, 0.0000, 0.0118, 0.0067]
|
132 |
+
}
|
133 |
+
st.table(results_data)
|
134 |
+
|
135 |
st.markdown("""
|
136 |
+
**Key Finding:** Run 4 (cleaned text + dropout + ordering) achieved the best balance
|
137 |
+
between low validation loss and meaningful retrieval quality.
|
|
|
|
|
138 |
""")
|
139 |
+
|
140 |
+
st.subheader("Qualitative Retrieval Examples")
|
141 |
st.markdown("""
|
142 |
+
**Query: "beef steak dinner"**
|
143 |
+
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
144 |
+
- Run 4 (Final): *grilled garlic steak dinner*, *classic beef steak au poivre*
|
145 |
+
|
146 |
+
**Query: "chicken italian pasta"**
|
147 |
+
- Run 1 (Raw): *to die for crock pot roast*, *crock pot chicken with black beans*
|
148 |
+
- Run 4 (Final): *creamy tuscan chicken pasta*, *italian chicken penne bake*
|
149 |
+
|
150 |
+
**Query: "vegetarian salad healthy"**
|
151 |
+
- Run 1 (Raw): (irrelevant hits)
|
152 |
+
- Run 4 (Final): *kale quinoa power salad*, *superfood spinach & berry salad*
|
153 |
""")
|
154 |
+
|
155 |
+
# Discussion and Conclusion
|
156 |
+
st.header("Discussion and Conclusion")
|
157 |
+
st.markdown("""
|
158 |
+
The experimental evidence underscores the importance of disciplined pre-processing when
|
159 |
+
adapting large language models to niche domains. The breakthrough came with **ingredient-ordering**
|
160 |
+
(protein → vegetables → grains → dairy → other) which supplied consistent positional signals.
|
161 |
+
|
162 |
+
**Key Achievements:**
|
163 |
+
- End-to-end recipe recommendation system with semantic search
|
164 |
+
- Sub-second latency across 231k recipes
|
165 |
+
- Meaningful semantic understanding of culinary content
|
166 |
+
- Reproducible blueprint for domain-specific NLP applications
|
167 |
+
|
168 |
+
**Limitations:**
|
169 |
+
- Private dataset relatively small (15k samples) compared to public corpora
|
170 |
+
- Minimal hyperparameter search conducted
|
171 |
+
- Single-machine deployment tested
|
172 |
+
""")
|
173 |
+
|
174 |
+
# Technical Specifications
|
175 |
+
st.header("Technical Specifications")
|
176 |
+
col1, col2 = st.columns(2)
|
177 |
+
with col1:
|
178 |
+
st.markdown("""
|
179 |
+
**Dataset:**
|
180 |
+
- Total Recipes: 231,630
|
181 |
+
- Training Set: 15,000 recipes
|
182 |
+
- Average Tags per Recipe: ~6
|
183 |
+
- Ingredients per Recipe: 3-20
|
184 |
+
""")
|
185 |
+
with col2:
|
186 |
+
st.markdown("""
|
187 |
+
**Infrastructure:**
|
188 |
+
- Python 3.10
|
189 |
+
- PyTorch 2.1 (CUDA 11.8)
|
190 |
+
- Transformers 4.38
|
191 |
+
- Google Colab A100 GPU
|
192 |
+
""")
|
193 |
+
|
194 |
+
# References
|
195 |
+
st.header("References")
|
196 |
+
st.markdown("""
|
197 |
+
[1] Vaswani et al., "Attention Is All You Need," NeurIPS, 2017.
|
198 |
+
|
199 |
+
[2] Devlin et al., "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding," NAACL-HLT, 2019.
|
200 |
+
|
201 |
+
[3] Reimers and Gurevych, "Sentence-BERT: Sentence Embeddings Using Siamese BERT-Networks," EMNLP-IJCNLP, 2019.
|
202 |
+
|
203 |
+
[4] Hugging Face, "BERT Model Documentation," 2024.
|
204 |
+
""")
|
205 |
+
|
206 |
st.markdown("---")
|
207 |
+
st.markdown("© 2025 CSE 555 Term Project. All rights reserved.")
|
208 |
|
209 |
+
# Render the report
|
210 |
render_report()
|
211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|