# Source path: pattern/scripts/NLP/processing_files_for_app.py
# Repository page metadata: sakshamlakhera - "Initial commit" - 733fcd8 - 14.9 kB
# (page links captured with the file: raw, history, blame)
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from ast import literal_eval
import re
import pickle
from datetime import datetime
def clean_text(text):
    """Return *text* lowercased with all whitespace runs collapsed to one space.

    Anything that is not a ``str`` (None, NaN floats, numbers, ...) yields an
    empty string so callers can apply this to a raw column without errors.
    """
    if isinstance(text, str):
        # split() with no arguments drops leading/trailing whitespace and
        # treats any run of whitespace as a single separator.
        return ' '.join(text.lower().split())
    return ''
def setup_tag_categories():
    """Build the keyword lists used to sort recipe tags into categories.

    Returns:
        A new dict mapping a category name to the list of keywords that
        identify tags belonging to that category.
    """
    categories = {}

    categories['cuisine'] = [
        'italian', 'chinese', 'mexican', 'indian', 'french',
        'greek', 'thai', 'japanese', 'american', 'european',
        'asian', 'mediterranean', 'spanish', 'german', 'korean',
        'vietnamese', 'turkish', 'moroccan', 'lebanese',
    ]
    categories['course'] = [
        'main-dish', 'side-dishes', 'appetizers', 'desserts',
        'breakfast', 'lunch', 'dinner', 'snacks',
        'beverages', 'salads', 'soups',
    ]
    categories['main_ingredient'] = [
        'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables',
        'fruit', 'pasta', 'rice', 'cheese', 'chocolate', 'potato',
        'lamb', 'turkey', 'beans', 'nuts', 'eggs', 'tofu',
    ]
    categories['dietary'] = [
        'vegetarian', 'vegan', 'gluten-free', 'low-carb',
        'healthy', 'low-fat', 'diabetic', 'dairy-free',
        'keto', 'paleo', 'whole30',
    ]
    categories['cooking_method'] = [
        'oven', 'stove-top', 'no-cook', 'microwave',
        'slow-cooker', 'grilling', 'baking', 'roasting',
        'frying', 'steaming', 'braising',
    ]
    categories['difficulty'] = [
        'easy', 'beginner-cook', 'advanced', 'intermediate', 'quick',
    ]
    categories['time'] = [
        '15-minutes-or-less', '30-minutes-or-less',
        '60-minutes-or-less', '4-hours-or-less', 'weeknight',
    ]
    categories['occasion'] = [
        'holiday-event', 'christmas', 'thanksgiving', 'valentines-day',
        'summer', 'winter', 'spring', 'fall', 'party', 'picnic',
    ]

    return categories
def setup_ingredient_groups():
    """Build the keyword lists used to rank ingredients by food group.

    Returns:
        A new dict mapping a group name ('proteins', 'vegetables',
        'grains_starches', 'dairy') to the keywords identifying that group.
    """
    groups = {}

    groups['proteins'] = [
        'chicken', 'beef', 'pork', 'fish',
        'salmon', 'tuna', 'shrimp', 'turkey',
        'lamb', 'bacon', 'ham', 'sausage',
        'eggs', 'tofu', 'beans', 'lentils',
    ]
    groups['vegetables'] = [
        'onion', 'garlic', 'tomato', 'carrot',
        'celery', 'pepper', 'mushroom', 'spinach',
        'broccoli', 'zucchini', 'potato', 'sweet potato',
    ]
    groups['grains_starches'] = [
        'rice', 'pasta', 'bread', 'flour',
        'oats', 'quinoa', 'barley', 'noodles',
    ]
    groups['dairy'] = [
        'milk', 'butter', 'cheese', 'cream',
        'yogurt', 'sour cream', 'cream cheese',
    ]

    return groups
def _parse_list_cell(value):
    """Parse one CSV cell that should hold a Python-literal list.

    Returns the parsed items as a list, or an empty list when the cell is
    missing (NaN), is not a string, is malformed, or does not evaluate to a
    list/tuple. Returning [] lets the caller drop the row with its normal
    "no tags / no ingredients" filter instead of aborting the whole load.
    """
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        return []
    try:
        parsed = literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the exceptions literal_eval documents for malformed input.
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []


def load_and_clean_recipes(recipes_path):
    """Load the recipes CSV and return only rows usable for embedding.

    Args:
        recipes_path: Path to a CSV with at least the columns 'name',
            'description', 'tags' and 'ingredients'. 'tags' and 'ingredients'
            are expected to contain Python list literals stored as strings.

    Returns:
        A DataFrame (index reset) where 'name' and 'description' are cleaned
        with ``clean_text``, 'tags' and 'ingredients' are real lists, and rows
        without a name, tags or ingredients have been removed.
    """
    print(f"Loading recipes from {recipes_path}")

    recipes_df = pd.read_csv(recipes_path)

    # Normalize the free-text columns; a missing name gets a sentinel value so
    # the row can be filtered out below.
    recipes_df['name'] = recipes_df['name'].fillna('unknown recipe').astype(str).apply(clean_text)
    recipes_df['description'] = recipes_df['description'].fillna('').astype(str).apply(clean_text)

    # Convert the stringified lists into real lists. Unparseable or missing
    # cells become [] rather than raising.
    for column in ('tags', 'ingredients'):
        recipes_df[column] = recipes_df[column].apply(_parse_list_cell)

    # Keep only recipes that have a real name, at least one tag and at least
    # one ingredient.
    has_tags = recipes_df['tags'].map(len) > 0
    has_ingredients = recipes_df['ingredients'].map(len) > 0
    has_name = (recipes_df['name'].str.len() > 0) & (recipes_df['name'] != 'unknown recipe')
    recipes_df = recipes_df[has_tags & has_ingredients & has_name].reset_index(drop=True)

    print(f"Final number of valid recipes: {len(recipes_df)}")
    return recipes_df
def categorize_recipe_tags(recipe_tags, tag_categories):
    """Sort tags into the categories whose keywords they contain.

    Args:
        recipe_tags: Iterable of tag strings.
        tag_categories: Dict mapping category name -> list of keywords.

    Returns:
        Dict with one entry per category in ``tag_categories``. Each value is
        the list of original tags (input order, original casing) whose
        lowercase form contains at least one keyword of that category. A tag
        can land in several categories but at most once in each.
    """
    buckets = {name: [] for name in tag_categories}

    for tag in recipe_tags:
        lowered = tag.lower()
        for name, keywords in tag_categories.items():
            # Substring match: one hit is enough to file the tag under `name`.
            if any(keyword in lowered for keyword in keywords):
                buckets[name].append(tag)

    return buckets
# Preparation descriptors and measurement units that say nothing about what
# the ingredient is. They are matched as whole words so that words merely
# containing them ("cupcake", "buttercup") stay intact, and so that plural
# units ("cups") are removed completely instead of leaving a stray "s".
_INGREDIENT_NOISE_RE = re.compile(
    r'\b(?:fresh|dried|chopped|minced|sliced|diced|ground|large|small|medium'
    r'|cups?|tablespoons?|teaspoons?|pounds?|ounces?)\b'
)

# Food groups in descending order of importance for the output ordering.
_INGREDIENT_GROUP_PRIORITY = ('proteins', 'vegetables', 'grains_starches', 'dairy')


def _clean_ingredient(ingredient):
    """Return one ingredient lowercased with digits and noise words removed.

    Returns '' for None, empty values and the string 'nan'.
    """
    if ingredient is None:
        return ''
    text = str(ingredient)
    if not text or text == 'nan':
        return ''
    # Digits go first so that "2cups" exposes "cups" as a standalone word.
    text = re.sub(r'\d+', '', text.lower())
    text = _INGREDIENT_NOISE_RE.sub('', text)
    return re.sub(r'\s+', ' ', text).strip()


def extract_main_ingredients(ingredients_list, ingredient_groups):
    """Clean a recipe's ingredients and order them by importance.

    Args:
        ingredients_list: List of raw ingredient values. Anything that is not
            a non-empty list yields [].
        ingredient_groups: Dict mapping group name -> keyword list, using the
            keys 'proteins', 'vegetables', 'grains_starches' and 'dairy'.

    Returns:
        List of cleaned ingredient strings without duplicates: proteins first,
        then vegetables, grains/starches, dairy, then everything else. Within
        each tier the input order is preserved. Cleaned strings of two
        characters or fewer are discarded.
    """
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []

    # Clean every ingredient, dropping empties and repeats (first one wins).
    cleaned_ingredients = []
    seen = set()
    for ingredient in ingredients_list:
        cleaned = _clean_ingredient(ingredient)
        if len(cleaned) > 2 and cleaned not in seen:
            seen.add(cleaned)
            cleaned_ingredients.append(cleaned)

    # Emit ingredients tier by tier; an ingredient is placed in the first
    # tier that has a keyword occurring inside it.
    ordered_ingredients = []
    placed = set()
    for group_name in _INGREDIENT_GROUP_PRIORITY:
        keywords = ingredient_groups.get(group_name, ())
        for ingredient in cleaned_ingredients:
            if ingredient in placed:
                continue
            if any(keyword in ingredient for keyword in keywords):
                ordered_ingredients.append(ingredient)
                placed.add(ingredient)

    # Whatever matched no group goes last, still in input order.
    ordered_ingredients.extend(
        ingredient for ingredient in cleaned_ingredients if ingredient not in placed
    )
    return ordered_ingredients
def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
    """Build the "Recipe / Ingredients / Style" sentence fed to the encoder.

    Args:
        recipe: Mapping-like row (e.g. a pandas Series) with the keys 'name',
            'tags' and 'ingredients'. Non-list 'tags'/'ingredients' values are
            treated as empty.
        tag_categories: Category -> keywords dict passed to
            ``categorize_recipe_tags``.
        ingredient_groups: Group -> keywords dict passed to
            ``extract_main_ingredients``.

    Returns:
        A string of the form
        ``"Recipe: <name>. Ingredients: <up to 8>. Style: <up to 10 tags>"``.
    """
    recipe_tags = recipe['tags'] if isinstance(recipe['tags'], list) else []
    categorized_tags = categorize_recipe_tags(recipe_tags, tag_categories)

    # Take up to two tags per category, most informative categories first.
    # A tag can be filed under several categories, so skip tags that were
    # already selected; otherwise the same tag is emitted twice and uses up
    # one of the limited tag slots.
    priority_categories = ('main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method')
    selected_tags = []
    for category in priority_categories:
        for tag in categorized_tags.get(category, [])[:2]:
            if tag not in selected_tags:
                selected_tags.append(tag)

    # Add up to three not-yet-selected tags that mention a generally useful
    # keyword.
    important_keywords = ('easy', 'quick', 'healthy', 'spicy', 'sweet')
    remaining_tags = []
    for tag in recipe_tags:
        if tag in selected_tags or tag in remaining_tags:
            continue
        lowered = tag.lower()
        if any(keyword in lowered for keyword in important_keywords):
            remaining_tags.append(tag)
    selected_tags.extend(remaining_tags[:3])

    recipe_ingredients = recipe['ingredients'] if isinstance(recipe['ingredients'], list) else []
    main_ingredients = extract_main_ingredients(recipe_ingredients, ingredient_groups)

    ingredients_text = ', '.join(main_ingredients[:8])
    tags_text = ', '.join(selected_tags[:10])

    # Collapse any run of whitespace in the name to a single space. The
    # previous single-space-for-single-space replace did not change the text.
    recipe_name = ' '.join(str(recipe['name']).split())

    return f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"
def create_recipe_statistics(interactions_path='RAW_interactions.csv'):
    """Compute per-recipe rating statistics from the interactions CSV.

    Args:
        interactions_path: Path to a CSV with at least the columns
            'recipe_id', 'user_id' and 'rating'.

    Returns:
        Dict mapping recipe_id -> (average_rating, number_of_ratings,
        unique_users) as (float, int, int). Rows whose rating is missing or
        not numeric are ignored, as are rows without a recipe_id.
    """
    print("Creating recipe statistics")

    interactions_df = pd.read_csv(interactions_path)

    # Coerce ratings to numbers, then drop everything that is missing or
    # failed the conversion.
    interactions_df = interactions_df.assign(
        rating=pd.to_numeric(interactions_df['rating'], errors='coerce')
    )
    interactions_df = interactions_df.dropna(subset=['rating'])
    print(f"Valid interactions after cleaning: {len(interactions_df)}")

    # One grouped pass instead of re-filtering the whole frame once per
    # recipe id. sort=False keeps recipes in order of first appearance.
    per_recipe = interactions_df.groupby('recipe_id', sort=False).agg(
        average_rating=('rating', 'mean'),
        number_of_ratings=('rating', 'size'),
        unique_users=('user_id', 'nunique'),
    )

    recipe_stats = {
        recipe_id: (float(average), int(count), int(users))
        for recipe_id, average, count, users in zip(
            per_recipe.index,
            per_recipe['average_rating'],
            per_recipe['number_of_ratings'],
            per_recipe['unique_users'],
        )
    }

    print(f"Created statistics for {len(recipe_stats)} recipes")
    return recipe_stats
def create_recipe_embeddings(recipes_df, model, tokenizer, device, tag_categories, ingredient_groups):
    """Encode every recipe into a vector using the model's first-token output.

    Args:
        recipes_df: DataFrame of recipes (rows are passed to
            ``create_structured_recipe_text``).
        model: Encoder called as ``model(input_ids, attention_mask)`` whose
            output exposes ``last_hidden_state``.
        tokenizer: Callable returning a mapping with 'input_ids' and
            'attention_mask' tensors.
        device: torch device the input tensors are moved to.
        tag_categories: Forwarded to ``create_structured_recipe_text``.
        ingredient_groups: Forwarded to ``create_structured_recipe_text``.

    Returns:
        ``(embeddings_array, valid_recipes_df)``: a numpy array with one row
        per successfully embedded recipe and a DataFrame (index reset) of
        those same recipes in the same order. Recipes that raise during
        processing are reported and skipped.
    """
    print("Creating recipe embeddings (this will take a long time)")
    recipe_embeddings_list = []
    valid_recipes_list = []

    for i in range(len(recipes_df)):
        recipe = recipes_df.iloc[i]
        try:
            recipe_text = create_structured_recipe_text(recipe, tag_categories, ingredient_groups)

            encoded = tokenizer(
                recipe_text,
                return_tensors='pt',
                truncation=True,
                max_length=128,
                padding='max_length'
            )

            # Keep the tokenizer output and the tensors under separate names.
            # Rebinding the tokenizer output to the input_ids tensor made the
            # following ['attention_mask'] lookup fail for every recipe.
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)

            with torch.no_grad():
                model_outputs = model(input_ids, attention_mask)

            # Hidden state of the first token (the [CLS] position for BERT).
            cls_embedding = model_outputs.last_hidden_state[:, 0, :]
            embedding_numpy = cls_embedding.cpu().numpy().flatten()
        except Exception as e:
            # Best effort: report the recipe and carry on with the rest.
            print(f"Error processing recipe {recipe.get('id', i)}: {e}")
            continue

        recipe_embeddings_list.append(embedding_numpy)
        valid_recipes_list.append(recipe.copy())

        if len(recipe_embeddings_list) % 1000 == 0:
            print(f"Processed {len(recipe_embeddings_list)} recipes")

    embeddings_array = np.array(recipe_embeddings_list)

    # Only the recipes that produced an embedding, aligned row-for-row with
    # embeddings_array.
    valid_recipes_df = pd.DataFrame(valid_recipes_list)
    valid_recipes_df = valid_recipes_df.reset_index(drop=True)

    print(f"Created {len(embeddings_array)} recipe embeddings")
    return embeddings_array, valid_recipes_df
def save_all_files(recipes_df, recipe_embeddings, recipe_stats):
    """Write the three pipeline artifacts to the current working directory.

    All file names share one timestamp suffix (``YYYYMMDD_HHMMSS``):
    ``recipe_embeddings_<ts>.npy`` (numpy), ``filtered_recipes_<ts>.pkl`` and
    ``recipe_statistics_<ts>.pkl`` (pickle).
    """
    print("Saving all files...")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    np.save(f'recipe_embeddings_{stamp}.npy', recipe_embeddings)
    print("Saved embeddings")

    # (file name, object to pickle, message printed once it is written)
    pickled_outputs = (
        (f'filtered_recipes_{stamp}.pkl', recipes_df, "Saved recipes."),
        (f'recipe_statistics_{stamp}.pkl', recipe_stats, "Saved statistics"),
    )
    for file_name, payload, done_message in pickled_outputs:
        with open(file_name, 'wb') as handle:
            pickle.dump(payload, handle)
        print(done_message)

    print("All files saved successfully!")
def create_all_necessary_files(recipes_path, interactions_path, model_path):
    """Run the whole preprocessing pipeline and write its outputs to disk.

    Steps: pick a device, load the 'bert-base-uncased' tokenizer and model,
    overwrite the model weights with the state dict at ``model_path``, load
    and filter the recipes, compute rating statistics, embed the recipes, and
    save everything via ``save_all_files``.

    Args:
        recipes_path: CSV path passed to ``load_and_clean_recipes``.
        interactions_path: CSV path passed to ``create_recipe_statistics``.
        model_path: File loaded with ``torch.load`` and applied with
            ``load_state_dict``.
    """
    print("Starting full preprocessing pipeline")

    # Prefer the GPU when one is available.
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"Using device: {device}")

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Start from the base architecture, then apply the trained weights.
    # NOTE(review): torch.load unpickles the file; only use checkpoints from
    # a trusted source.
    model = BertModel.from_pretrained('bert-base-uncased')
    trained_weights = torch.load(model_path, map_location=device)
    model.load_state_dict(trained_weights)
    model.to(device)
    model.eval()

    # Keyword tables used when building each recipe's text.
    tag_categories = setup_tag_categories()
    ingredient_groups = setup_ingredient_groups()

    recipes_df = load_and_clean_recipes(recipes_path)
    recipe_stats = create_recipe_statistics(interactions_path)

    embedding_result = create_recipe_embeddings(
        recipes_df, model, tokenizer, device, tag_categories, ingredient_groups
    )
    recipe_embeddings, filtered_recipes_df = embedding_result

    save_all_files(filtered_recipes_df, recipe_embeddings, recipe_stats)
if __name__ == "__main__":
    # Script entry point: run the pipeline on the default input files, which
    # are looked up relative to the current working directory.
    pipeline_inputs = {
        'recipes_path': 'RAW_recipes.csv',
        'interactions_path': 'RAW_interactions.csv',
        'model_path': 'tag_based_bert_model.pth',
    }
    create_all_necessary_files(**pipeline_inputs)
    print("All preprocessing complete! You can now use the search system.")