import pandas as pd

# Load the FAQ dataset (expects columns: topic, question, answer)
df = pd.read_csv("data/faq_data.csv")
print(df)
from openai import OpenAI

# Feel free to use ChatGPT or other models - Google offers a free Gemini API, so I have used that here.
model_name = {
    "OpenAI GPT-4": {
        "name": "gpt-4",
        "url": "https://api.openai.com/v1/"
    },
    "Gemini 1.5Flash": {
        "name": "gemini-1.5-flash",
        # Gemini's OpenAI-compatible endpoint lives under the /openai/ path
        "url": "https://generativelanguage.googleapis.com/v1beta/openai/"
    },
    "Gemini 2.0Flash": {
        "name": "gemini-2.0-flash",
        "url": "https://generativelanguage.googleapis.com/v1beta/openai/"
    },
    "Together AI": {
        "name": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "url": "https://api.together.xyz/v1/"
    }
}
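# Quick sanity check (a minimal sketch; the GEMINI_API_KEY environment variable
# name is an assumption for illustration, not part of the original code): any of
# the configs above can be plugged straight into the OpenAI-compatible client.
import os

_cfg = model_name["Gemini 1.5Flash"]
_client = OpenAI(base_url=_cfg["url"], api_key=os.environ["GEMINI_API_KEY"])
print(_client.models.list())  # cheap round-trip to confirm the base URL and key work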
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# Optional: chunk answers into smaller pieces (~200 words, a rough proxy for tokens) for better retrieval
def chunk_text(text, max_tokens=200):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i + max_tokens])
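# Example: chunk_text splits on whitespace, so max_tokens is really a word count
# that approximates tokens. A 450-word string yields chunks of 200, 200, and 50 words.
_sample = " ".join(f"word{i}" for i in range(450))
print([len(c.split()) for c in chunk_text(_sample, max_tokens=200)])  # [200, 200, 50]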
chunks = []
metadata = []
for idx, row in df.iterrows():
    for c in chunk_text(row['answer']):
        chunks.append(c)
        metadata.append({
            "topic": row['topic'],
            "question": row['question'],
            "answer_chunk": c
        })
from sentence_transformers import SentenceTransformer

# I am using a Hugging Face model to generate embeddings; if the data were large, one could try and explore other embedding models.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Embed chunks
chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)

# Build a FAISS index over the chunk embeddings (exact L2 search)
import faiss

dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors)
# Query embedding function
def embed_query(query):
    return embedder.encode([query], convert_to_numpy=True)[0]
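# Example (the sample query is illustrative): embed a query and search the FAISS
# index directly; smaller L2 distances mean closer matches.
_q = embed_query("Can I change my flight date?")
_D, _I = index.search(np.array([_q], dtype='float32'), 3)
for _dist, _i in zip(_D[0], _I[0]):
    print(round(float(_dist), 3), metadata[_i]['question'])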
# TF-IDF index over the FAQ questions, for lexical matching
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])
def retrieve_similar_answer_chunks(query, k=5):
    q_vec = embed_query(query)
    D, I = index.search(np.array([q_vec]).astype('float32'), k)
    return [metadata[i] for i in I[0]]
def retrieve_similar_questions(query, k=5):
    q_tfidf = tfidf_vectorizer.transform([query])
    scores = (tfidf_matrix @ q_tfidf.T).toarray()
    topk_idx = scores[:, 0].argsort()[-k:][::-1]
    return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
def hybrid_retrieve(query, k=5):
    answer_chunks = retrieve_similar_answer_chunks(query, k)
    question_hits = retrieve_similar_questions(query, k)
    combined_contexts = answer_chunks + question_hits
    # Deduplicate on (topic, question) so the same FAQ entry isn't passed to the model twice
    seen = set()
    filtered = []
    for c in combined_contexts:
        key = (c.get('topic'), c.get('question'))
        if key not in seen:
            filtered.append(c)
            seen.add(key)
    return filtered
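# Example (the sample query is illustrative): hybrid retrieval merges semantic
# answer-chunk hits with lexical question hits, deduplicated per FAQ entry.
for _c in hybrid_retrieve("How much baggage can I carry?", k=3):
    print(_c['topic'], '|', _c['question'])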
def generate_answer(message, model_choice, api_key):
    contexts = hybrid_retrieve(message, k=5)
    context_text = "\n\n".join([f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}" for c in contexts])
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert airline assistant answering user queries based on the provided context."
                " Use the context to generate a helpful, factual, self-contained answer."
                " If the context doesn't help and you are unable to answer, ask the user to reach out to our customer service call center at 1 800 800 000 or email us at [email protected]."
                " If the user is just greeting (e.g., says 'hi', 'hello', 'good morning', etc.), greet back briefly and ask how you may assist."
                " If the user is asking something random and unrelated to airline services and it's not a greeting, reply: 'I don't understand your question, can you please rephrase?'"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion: {message}\nAnswer:"
        }
    ]
    config = model_name[model_choice]
    # Use a dedicated client instance rather than shadowing the openai module
    client = OpenAI(
        base_url=config["url"],
        api_key=api_key,
    )
    response = client.chat.completions.create(
        model=config["name"],  # was hardcoded to "gemini-1.5-flash", which ignored model_choice
        messages=messages,
        temperature=0.7,
        max_tokens=700,
    )
    return response.choices[0].message.content
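# Example usage (a minimal sketch; the GEMINI_API_KEY environment variable is an
# assumption for illustration - supply whichever key matches your model_choice).
import os

print(generate_answer(
    "What is your cancellation policy?",
    model_choice="Gemini 1.5Flash",
    api_key=os.environ["GEMINI_API_KEY"],
))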