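# NOTE: the lines below appear to be web-page header text captured together
# with the file; they are kept as comments so the module remains valid Python.
# Spaces:
#
# PatternGroup5
# /
#
# pattern
#
# Sleeping
#
# File size: 14,919 Bytes
#
# 733fcd8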

import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from ast import literal_eval
import re
import pickle
from datetime import datetime

def clean_text(text):
    """Return *text* lowercased with all whitespace runs collapsed to one space.

    Non-string input (e.g. ``None`` or a float NaN) yields an empty string so
    callers can apply this to a column without type errors.
    """
    if isinstance(text, str):
        # split() with no argument drops leading/trailing whitespace and
        # treats any run of whitespace as a single separator.
        return ' '.join(text.lower().split())
    return ''

def setup_tag_categories():
    """Build the mapping of tag category name -> keywords for that category.

    A fresh dict (with fresh lists) is returned on every call, so callers may
    mutate the result without affecting later calls.
    """
    return dict(
        cuisine=[
            'italian', 'chinese', 'mexican', 'indian', 'french', 'greek',
            'thai', 'japanese', 'american', 'european', 'asian',
            'mediterranean', 'spanish', 'german', 'korean', 'vietnamese',
            'turkish', 'moroccan', 'lebanese',
        ],
        course=[
            'main-dish', 'side-dishes', 'appetizers', 'desserts',
            'breakfast', 'lunch', 'dinner', 'snacks', 'beverages', 'salads',
            'soups',
        ],
        main_ingredient=[
            'chicken', 'beef', 'pork', 'fish', 'seafood', 'vegetables',
            'fruit', 'pasta', 'rice', 'cheese', 'chocolate', 'potato',
            'lamb', 'turkey', 'beans', 'nuts', 'eggs', 'tofu',
        ],
        dietary=[
            'vegetarian', 'vegan', 'gluten-free', 'low-carb', 'healthy',
            'low-fat', 'diabetic', 'dairy-free', 'keto', 'paleo', 'whole30',
        ],
        cooking_method=[
            'oven', 'stove-top', 'no-cook', 'microwave', 'slow-cooker',
            'grilling', 'baking', 'roasting', 'frying', 'steaming',
            'braising',
        ],
        difficulty=['easy', 'beginner-cook', 'advanced', 'intermediate', 'quick'],
        time=[
            '15-minutes-or-less', '30-minutes-or-less', '60-minutes-or-less',
            '4-hours-or-less', 'weeknight',
        ],
        occasion=[
            'holiday-event', 'christmas', 'thanksgiving', 'valentines-day',
            'summer', 'winter', 'spring', 'fall', 'party', 'picnic',
        ],
    )

def setup_ingredient_groups():
    """Build the mapping of ingredient group name -> keywords in that group.

    A fresh dict (with fresh lists) is returned on every call.
    """
    proteins = [
        'chicken', 'beef', 'pork', 'fish', 'salmon', 'tuna', 'shrimp',
        'turkey', 'lamb', 'bacon', 'ham', 'sausage', 'eggs', 'tofu', 'beans',
        'lentils',
    ]
    vegetables = [
        'onion', 'garlic', 'tomato', 'carrot', 'celery', 'pepper',
        'mushroom', 'spinach', 'broccoli', 'zucchini', 'potato',
        'sweet potato',
    ]
    grains_starches = [
        'rice', 'pasta', 'bread', 'flour', 'oats', 'quinoa', 'barley',
        'noodles',
    ]
    dairy = [
        'milk', 'butter', 'cheese', 'cream', 'yogurt', 'sour cream',
        'cream cheese',
    ]
    return {
        'proteins': proteins,
        'vegetables': vegetables,
        'grains_starches': grains_starches,
        'dairy': dairy,
    }

def load_and_clean_recipes(recipes_path):
    """Read the recipes CSV, normalise its columns and drop unusable rows.

    The ``name`` and ``description`` columns are passed through ``clean_text``
    (missing names become the placeholder ``'unknown recipe'``), and the
    ``tags`` / ``ingredients`` columns are parsed from their string form with
    ``ast.literal_eval``.  Rows with no tags, no ingredients, an empty name or
    the placeholder name are removed.

    Returns the filtered DataFrame with a fresh 0..n-1 index.
    """
    print(f"Loading recipes from {recipes_path}")

    frame = pd.read_csv(recipes_path)

    # Text columns: fill gaps, force to str, then normalise.
    text_columns = (('name', 'unknown recipe'), ('description', ''))
    for column, placeholder in text_columns:
        frame[column] = frame[column].fillna(placeholder).astype(str).apply(clean_text)

    # List-like columns are stored as Python literals inside the CSV cells.
    for column in ('tags', 'ingredients'):
        frame[column] = frame[column].apply(literal_eval)

    # Build the row filter one condition at a time for readability.
    has_tags = frame['tags'].str.len() > 0
    has_ingredients = frame['ingredients'].str.len() > 0
    has_name = frame['name'].str.len() > 0
    not_placeholder = frame['name'] != 'unknown recipe'
    keep = has_tags & has_ingredients & has_name & not_placeholder

    frame = frame[keep].reset_index(drop=True)

    print(f"Final number of valid recipes: {len(frame)}")
    return frame

def categorize_recipe_tags(recipe_tags, tag_categories):
    """Sort recipe tags into the categories whose keywords they contain.

    A tag belongs to a category when any of that category's keywords is a
    substring of the lowercased tag.  A tag may land in several categories,
    but appears at most once per category for each occurrence in
    ``recipe_tags``.

    Returns a dict with one (possibly empty) list per key of
    ``tag_categories``; tags keep their original spelling and order.
    """
    buckets = {name: [] for name in tag_categories}

    for tag in recipe_tags:
        lowered = tag.lower()
        for name, keywords in tag_categories.items():
            if any(keyword in lowered for keyword in keywords):
                buckets[name].append(tag)

    return buckets

# Descriptor and measurement words stripped from ingredient strings.  They are
# matched as WHOLE words: plain substring removal turned "cups" into a stray
# "s" (because "cup" was removed first) and mangled words such as "cupcake".
_INGREDIENT_NOISE_PATTERN = re.compile(
    r'\b(?:fresh|dried|chopped|minced|sliced|diced|ground|large|small|medium'
    r'|cups?|tablespoons?|teaspoons?|pounds?|ounces?)\b'
)
_INGREDIENT_DIGITS_PATTERN = re.compile(r'\d+')
_INGREDIENT_SPACES_PATTERN = re.compile(r'\s+')

# Ingredient groups in decreasing order of importance.
_INGREDIENT_GROUP_PRIORITY = ('proteins', 'vegetables', 'grains_starches', 'dairy')


def _clean_ingredient(ingredient):
    """Return the normalised ingredient text, or '' when nothing usable is left."""
    if ingredient is None:
        return ''
    text = str(ingredient)
    if not text or text == 'nan':
        return ''

    # Digits go first so that a glued quantity such as "2cups" still exposes
    # "cups" as a whole word to the noise-word pattern.
    text = _INGREDIENT_DIGITS_PATTERN.sub('', text.lower())
    text = _INGREDIENT_NOISE_PATTERN.sub('', text)
    text = _INGREDIENT_SPACES_PATTERN.sub(' ', text).strip()

    # Very short leftovers (one or two characters) carry no information.
    return text if len(text) > 2 else ''


def extract_main_ingredients(ingredients_list, ingredient_groups):
    """Clean a recipe's ingredient strings and order them by importance.

    Each ingredient is lowercased and stripped of digits, descriptor words
    (fresh, chopped, ...) and measurement words (cup, tablespoons, ...);
    results of two characters or fewer are discarded.  The survivors are then
    ordered: proteins first, then vegetables, grains/starches and dairy, and
    finally everything else, each in input order.  An ingredient belongs to a
    group when any of the group's keywords is a substring of it.

    Args:
        ingredients_list: list of ingredient values; anything that is not a
            non-empty list yields ``[]``.  ``None`` and ``'nan'`` entries are
            skipped.
        ingredient_groups: mapping of group name -> list of keywords.  Groups
            missing from the mapping are treated as empty.

    Returns:
        List of cleaned ingredient strings without duplicates (the first
        occurrence wins), in priority order.
    """
    if not ingredients_list or not isinstance(ingredients_list, list):
        return []

    cleaned_ingredients = [
        cleaned
        for cleaned in map(_clean_ingredient, ingredients_list)
        if cleaned
    ]

    ordered_ingredients = []
    seen = set()  # O(1) membership test; also de-duplicates every pass

    for group_name in _INGREDIENT_GROUP_PRIORITY:
        keywords = ingredient_groups.get(group_name, ())
        for ingredient in cleaned_ingredients:
            if ingredient in seen:
                continue
            if any(keyword in ingredient for keyword in keywords):
                seen.add(ingredient)
                ordered_ingredients.append(ingredient)

    # Whatever matched no group goes last, still in input order.
    for ingredient in cleaned_ingredients:
        if ingredient not in seen:
            seen.add(ingredient)
            ordered_ingredients.append(ingredient)

    return ordered_ingredients

def create_structured_recipe_text(recipe, tag_categories, ingredient_groups):
    """Render one recipe as the ``Recipe / Ingredients / Style`` text line.

    Tag selection: up to two tags from each of the priority categories
    (main ingredient, cuisine, course, dietary, cooking method), followed by
    up to three not-yet-selected tags that contain one of a few descriptive
    keywords.  At most ten tags and eight ingredients end up in the text.

    Args:
        recipe: mapping-like row with ``'tags'``, ``'ingredients'`` and
            ``'name'`` entries; non-list tags/ingredients count as empty.
        tag_categories: category name -> keywords, as consumed by
            ``categorize_recipe_tags``.
        ingredient_groups: group name -> keywords, as consumed by
            ``extract_main_ingredients``.

    Returns:
        The structured text string.
    """
    raw_tags = recipe['tags']
    tags = raw_tags if isinstance(raw_tags, list) else []
    tags_by_category = categorize_recipe_tags(tags, tag_categories)

    # Up to two tags per priority category, in priority order.
    chosen_tags = []
    for category in ('main_ingredient', 'cuisine', 'course', 'dietary', 'cooking_method'):
        chosen_tags.extend(tags_by_category.get(category, [])[:2])

    # Unselected tags that mention one of the descriptive keywords.
    descriptive_words = ('easy', 'quick', 'healthy', 'spicy', 'sweet')
    extra_tags = [
        tag
        for tag in tags
        if tag not in chosen_tags
        and any(word in tag.lower() for word in descriptive_words)
    ]
    chosen_tags.extend(extra_tags[:3])

    raw_ingredients = recipe['ingredients']
    ingredients = raw_ingredients if isinstance(raw_ingredients, list) else []
    main_ingredients = extract_main_ingredients(ingredients, ingredient_groups)

    ingredients_text = ', '.join(main_ingredients[:8])
    tags_text = ', '.join(chosen_tags[:10])
    recipe_name = str(recipe['name']).replace('  ', ' ').strip()

    return f"Recipe: {recipe_name}. Ingredients: {ingredients_text}. Style: {tags_text}"
        

def create_recipe_statistics(interactions_path='RAW_interactions.csv'):
    """Compute per-recipe rating statistics from the interactions CSV.

    Rows whose ``rating`` is missing or not numeric are discarded.  The
    remaining rows are aggregated per ``recipe_id`` in a single grouped pass
    (instead of re-filtering the whole table once per recipe).  Rows with a
    missing ``recipe_id`` are not assigned to any recipe.

    Args:
        interactions_path: path to a CSV with at least the columns
            ``recipe_id``, ``user_id`` and ``rating``.

    Returns:
        Dict mapping recipe id -> ``(average_rating, number_of_ratings,
        unique_users)`` as ``(float, int, int)``, in order of each recipe's
        first appearance in the file.
    """
    print("Creating recipe statistics")

    interactions_df = pd.read_csv(interactions_path)

    # Coerce ratings to numbers; unparseable values become NaN and are dropped
    # together with ratings that were missing to begin with.
    interactions_df['rating'] = pd.to_numeric(interactions_df['rating'], errors='coerce')
    interactions_df = interactions_df.dropna(subset=['rating'])

    print(f"Valid interactions after cleaning: {len(interactions_df)}")

    # sort=False keeps recipes in first-appearance order.
    summary = interactions_df.groupby('recipe_id', sort=False).agg(
        average_rating=('rating', 'mean'),
        number_of_ratings=('rating', 'count'),
        unique_users=('user_id', 'nunique'),
    )

    # Convert to plain Python numbers so the result pickles compactly and does
    # not depend on NumPy scalar types.
    recipe_stats = {
        recipe_id: (float(average), int(count), int(users))
        for recipe_id, average, count, users in zip(
            summary.index,
            summary['average_rating'],
            summary['number_of_ratings'],
            summary['unique_users'],
        )
    }

    print(f"Created statistics for {len(recipe_stats)} recipes")
    return recipe_stats

def create_recipe_embeddings(recipes_df, model, tokenizer, device, tag_categories, ingredient_groups):
    """Embed every recipe with the model and return embeddings plus kept rows.

    For each row the structured recipe text is built, tokenized (truncated /
    padded to 128 tokens) and passed through ``model``; the hidden state of
    the first token is used as the recipe embedding.  A recipe whose
    processing raises is reported and skipped, so the returned DataFrame can
    be shorter than ``recipes_df``.

    Args:
        recipes_df: DataFrame of recipes (rows are passed to
            ``create_structured_recipe_text``).
        model: callable invoked as ``model(input_ids, attention_mask)`` whose
            result exposes ``last_hidden_state``.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: torch device the input tensors are moved to.
        tag_categories: forwarded to ``create_structured_recipe_text``.
        ingredient_groups: forwarded to ``create_structured_recipe_text``.

    Returns:
        ``(embeddings_array, valid_recipes_df)`` where row ``i`` of the array
        is the embedding of row ``i`` of the DataFrame.
    """
    print("Creating recipe embeddings (this will take a long time)")

    recipe_embeddings_list = []
    valid_recipes_list = []

    # Process each recipe one by one
    for i in range(len(recipes_df)):
        recipe = recipes_df.iloc[i]

        try:
            # Create structured text for this recipe
            recipe_text = create_structured_recipe_text(recipe, tag_categories, ingredient_groups)

            # Tokenize the recipe text
            encoded = tokenizer(
                recipe_text,
                return_tensors='pt',
                truncation=True,
                max_length=128,
                padding='max_length'
            )

            # Keep the encoding, the ids and the mask under separate names.
            # Rebinding the encoding to the ids tensor made the mask lookup
            # fail for every recipe, and the broad except below hid it.
            input_ids = encoded['input_ids'].to(device)
            attention_mask = encoded['attention_mask'].to(device)

            # Inference only: no gradients needed
            with torch.no_grad():
                model_outputs = model(input_ids, attention_mask)

            # First-token (CLS) hidden state, moved to CPU as a flat vector
            cls_embedding = model_outputs.last_hidden_state[:, 0, :]
            embedding_numpy = cls_embedding.cpu().numpy().flatten()

        except Exception as e:
            # Best effort: report the failing recipe and carry on with the rest
            print(f"Error processing recipe {recipe.get('id', i)}: {e}")
            continue

        # Store the embedding and recipe together so the two stay aligned
        recipe_embeddings_list.append(embedding_numpy)
        valid_recipes_list.append(recipe.copy())

        # Show progress every 1000 recipes
        if len(recipe_embeddings_list) % 1000 == 0:
            print(f"Processed {len(recipe_embeddings_list)} recipes")

    # Convert list to numpy array
    embeddings_array = np.array(recipe_embeddings_list)

    # Create new dataframe with only valid recipes
    valid_recipes_df = pd.DataFrame(valid_recipes_list)
    valid_recipes_df = valid_recipes_df.reset_index(drop=True)

    print(f"Created {len(embeddings_array)} recipe embeddings")
    return embeddings_array, valid_recipes_df

def save_all_files(recipes_df, recipe_embeddings, recipe_stats):
    """Write the embeddings, recipes and statistics to timestamped files.

    Three files are created in the current working directory, all sharing one
    ``YYYYMMDD_HHMMSS`` timestamp: the embeddings as ``.npy`` and the recipes
    DataFrame and statistics dict as pickles.
    """
    print("Saving all files...")
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    np.save(f'recipe_embeddings_{stamp}.npy', recipe_embeddings)
    print("Saved embeddings")

    # (target path, object to pickle, confirmation message)
    pickle_jobs = (
        (f'filtered_recipes_{stamp}.pkl', recipes_df, "Saved recipes."),
        (f'recipe_statistics_{stamp}.pkl', recipe_stats, "Saved statistics"),
    )
    for target_path, payload, confirmation in pickle_jobs:
        with open(target_path, 'wb') as handle:
            pickle.dump(payload, handle)
        print(confirmation)

    print("All files saved successfully!")

def create_all_necessary_files(recipes_path, interactions_path, model_path):
    """Run the whole preprocessing pipeline and save its outputs.

    Loads the ``bert-base-uncased`` tokenizer and model, applies the weights
    stored at ``model_path``, cleans the recipes, computes rating statistics,
    embeds every recipe and finally writes everything via ``save_all_files``.

    Args:
        recipes_path: CSV of recipes for ``load_and_clean_recipes``.
        interactions_path: CSV of interactions for ``create_recipe_statistics``.
        model_path: file passed to ``torch.load`` to obtain the state dict.
    """
    print("Starting full preprocessing pipeline")

    # Prefer the GPU when one is available
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print(f"Using device: {device}")

    # Tokenizer and base model, then the trained weights on top
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    trained_weights = torch.load(model_path, map_location=device)
    model.load_state_dict(trained_weights)
    model.to(device)
    model.eval()

    # Keyword tables used when building the recipe text
    categories = setup_tag_categories()
    groups = setup_ingredient_groups()

    # Data preparation
    cleaned_recipes = load_and_clean_recipes(recipes_path)
    statistics = create_recipe_statistics(interactions_path)

    # Embeddings (only recipes that were embedded successfully are kept)
    embeddings, embedded_recipes = create_recipe_embeddings(
        cleaned_recipes, model, tokenizer, device, categories, groups
    )

    save_all_files(embedded_recipes, embeddings, statistics)

if __name__ == "__main__":
    # Default input files, expected in the current working directory.
    pipeline_inputs = {
        'recipes_path': 'RAW_recipes.csv',
        'interactions_path': 'RAW_interactions.csv',
        'model_path': 'tag_based_bert_model.pth',
    }
    create_all_necessary_files(**pipeline_inputs)

    print("All preprocessing complete! You can now use the search system.")