# IFX-sandbox/data/z_old/create_embeddings.py
# Uploaded by aliss77777 using huggingface_hub (commit 06cb2a3, verified).
import pandas as pd
from openai import OpenAI
import os
from dotenv import load_dotenv
import numpy as np
# Pull settings from a local .env file into the process environment so the
# API key does not have to be exported in the shell.
load_dotenv()

# Shared OpenAI client; the key is looked up from OPENAI_API_KEY and is None
# when that variable is not set.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def get_embedding(text):
    """Get embedding for text using OpenAI's text-embedding-3-small.

    Args:
        text: Summary text to embed. May be a missing value (``None``/NaN).

    Returns:
        A list of 1536 numbers. Missing values, blank strings and the
        "Specific game details are not available." placeholder produce an
        all-zero vector without calling the API; anything else returns the
        embedding reported by the API for the whitespace-stripped text.
    """
    # text-embedding-3-small produces 1536-dimensional embeddings, so the
    # zero vector used for "nothing to embed" rows has the same length.
    if pd.isna(text):
        return [0] * 1536

    cleaned = text.strip()
    # Strip before comparing so stray whitespace around the placeholder is
    # still recognised, and skip blank strings entirely: the embeddings
    # endpoint rejects empty input, which would abort the whole run.
    if not cleaned or cleaned == "Specific game details are not available.":
        return [0] * 1536

    response = client.embeddings.create(
        input=cleaned,
        model="text-embedding-3-small",
    )
    return response.data[0].embedding
def main():
    """Embed every ``Summary`` in the schedule CSV and write an augmented copy.

    Reads ``schedule_with_result.csv``, stores the result of
    ``get_embedding`` for each row in a new ``embedding`` column, and saves
    the frame to ``schedule_with_result_embedding.csv`` without the index.
    If the input has no ``Summary`` column, an error is printed and nothing
    is written.
    """
    source_csv = "merged/data/niners_output/schedule_with_result.csv"
    target_csv = "merged/data/niners_output/schedule_with_result_embedding.csv"

    print(f"Reading from {source_csv}")
    schedule = pd.read_csv(source_csv)

    # Bail out early: without the Summary column there is nothing to embed.
    has_summary = "Summary" in schedule.columns
    if not has_summary:
        print("Error: 'Summary' column not found in the CSV file.")
        return

    print("Generating embeddings...")
    # The embeddings are attached to the original frame, one list per row.
    summaries = schedule["Summary"]
    schedule["embedding"] = summaries.apply(get_embedding)

    print(f"Saving embeddings to {target_csv}")
    schedule.to_csv(target_csv, index=False)
    print("Done!")
# Run the pipeline only when this file is executed as a script; importing
# the module does not call main().
if __name__ == "__main__":
    main()