# airline_faq_rag_agent / baseline_code.py
# Author: Palbha Kulkarni (Nazwale)
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import userdata  # Colab secret storage for API keys

# Feel free to use ChatGPT or other models - Google offers a free Gemini API, hence I have used it
df = pd.read_csv("/data/faq_data.csv")
print(df)
# Optional: chunk answers into smaller pieces (e.g. 200 whitespace-separated words) for better retrieval
def chunk_text(text, max_tokens=200):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i + max_tokens])

chunks = []
metadata = []
for idx, row in df.iterrows():
    for c in chunk_text(row['answer']):
        chunks.append(c)
        metadata.append({
            "topic": row['topic'],
            "question": row['question'],
            "answer_chunk": c,
        })
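# Quick sanity check on the chunking step (illustrative only; the counts depend on
# whatever faq_data.csv contains)
print(f"Built {len(chunks)} answer chunks from {len(df)} FAQ rows")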
from sentence_transformers import SentenceTransformer

# I am using a Hugging Face model to generate embeddings; if the data were large, one could try & explore other embedding models
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed all answer chunks
chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)
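# Sanity check on the embedding matrix: all-MiniLM-L6-v2 produces 384-dimensional
# vectors, so this should print (num_chunks, 384)
print(chunk_vectors.shape)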
# Build the FAISS index as before
import faiss

dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors)

# Query embedding helper: uses the same model as the chunks
def embed_query(query):
    return embedder.encode([query], convert_to_numpy=True)[0]
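# Minimal sketch of a raw FAISS lookup; the query string is a made-up example.
# D holds squared L2 distances, I holds row indices into chunks/metadata.
D, I = index.search(embed_query("Can I carry a power bank in my hand luggage?").reshape(1, -1), 3)
print([metadata[i]["question"] for i in I[0]])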
# TF-IDF index over the FAQ questions for lexical matching
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])

# Dense retrieval: nearest answer chunks by embedding distance
def retrieve_similar_answer_chunks(query, k=5):
    q_vec = embed_query(query)
    D, I = index.search(np.array([q_vec]).astype('float32'), k)
    return [metadata[i] for i in I[0]]

# Lexical retrieval: most similar FAQ questions by TF-IDF score
def retrieve_similar_questions(query, k=5):
    q_tfidf = tfidf_vectorizer.transform([query])
    scores = (tfidf_matrix @ q_tfidf.T).toarray()
    topk_idx = scores[:, 0].argsort()[-k:][::-1]
    return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
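# Example lexical (TF-IDF) lookup; the query is illustrative and the hits depend on the CSV
for hit in retrieve_similar_questions("baggage allowance on international flights", k=3):
    print(hit['topic'], '|', hit['question'])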
# Hybrid retrieval: union of dense chunk hits and lexical question hits
def hybrid_retrieve(query, k=5):
    answer_chunks = retrieve_similar_answer_chunks(query, k)
    question_hits = retrieve_similar_questions(query, k)
    combined_contexts = answer_chunks + question_hits
    # Deduplicate on (topic, question) so the same FAQ entry isn't repeated
    seen = set()
    filtered = []
    for c in combined_contexts:
        key = (c.get('topic'), c.get('question'))
        if key not in seen:
            filtered.append(c)
            seen.add(key)
    return filtered
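# Example hybrid retrieval run (illustrative query); note that question hits carry a
# full 'answer' while FAISS hits carry an 'answer_chunk'
for hit in hybrid_retrieve("How do I change my seat after booking?", k=3):
    print(hit.get('topic'), '-', hit.get('question'))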
def generate_answer(query, model_choice, api_key):
    contexts = hybrid_retrieve(query, k=5)
    context_text = "\n\n".join(
        f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}" for c in contexts
    )
    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert airline assistant answering user queries based on provided context. "
                "Use the context to generate a helpful, factual, self-contained answer. "
                "If the context doesn't help and you are unable to answer, please tell the user to reach out to our customer service call center at 1 800 800 000 or email us at [email protected]. "
                "If the user asks a random question that is not something an airline assistant would be asked, reply: I don't understand your question, can you please rephrase?"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
        }
    ]
    # Gemini served through its OpenAI-compatible endpoint; use a distinct name
    # so the openai module is not shadowed
    client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/",
        api_key=api_key,
    )
    response = client.chat.completions.create(
        model=model_choice,  # e.g. "gemini-1.5-flash"
        messages=messages,
        temperature=0.7,
        max_tokens=700,
    )
    return response.choices[0].message.content
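# Example end-to-end call. The Colab secret name "GOOGLE_API_KEY" is an assumption -
# use whatever name you stored your Gemini key under in Colab's userdata.
api_key = userdata.get("GOOGLE_API_KEY")  # assumed secret name
print(generate_answer("What is the checked baggage limit?", "gemini-1.5-flash", api_key))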