Spaces:
Sleeping
Sleeping
File size: 4,551 Bytes
7f681f1 97559d4 7f681f1 785d301 7f681f1 9311095 b5476e6 7f681f1 cc846af 7f681f1 b5476e6 7f681f1 785d301 ae70460 785d301 7f681f1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import pandas as pd
import openai
# Load the FAQ table from disk. Later code in this file reads its 'topic',
# 'question' and 'answer' columns.
df=pd.read_csv("data/faq_data.csv")
# Dump the loaded frame to stdout; this runs at import time.
print(df)
from openai import OpenAI
# Feel free to use ChatGPT or other models. Google offers a free Gemini API, hence I have used it here.
# Registry of selectable chat backends, keyed by display label. Each entry
# holds the model identifier ("name") and the base URL ("url") that
# generate_answer passes to the OpenAI client.
# NOTE(review): Google's OpenAI-compatibility docs show the Gemini base URL
# with a trailing "openai/" path segment - confirm the URLs below still work.
model_name = {
    label: {"name": model_id, "url": base_url}
    for label, model_id, base_url in (
        # Replace with the correct OpenAI base URL if it differs.
        ("OpenAI GPT-4", "gpt-4", "https://api.openai.com/v1/"),
        (
            "Gemini 1.5Flash",
            "gemini-1.5-flash",
            "https://generativelanguage.googleapis.com/v1beta/",
        ),
        (
            "Gemini 2.0Flash",
            "gemini-2.0-flash",
            "https://generativelanguage.googleapis.com/v1beta/",
        ),
        # Replace with the correct Together base URL if it differs.
        (
            "Together AI",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "https://api.together.xyz/v1/",
        ),
    )
}
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
def chunk_text(text, max_tokens=200):
    """Lazily split *text* into pieces of at most *max_tokens* words.

    Optional preprocessing step: answers are cut into smaller pieces
    (200 words by default) with the aim of better retrieval. "Tokens" here
    are whitespace-separated words from ``str.split()``; each piece is
    re-joined with single spaces, so original whitespace is not preserved.

    Yields:
        Consecutive, non-overlapping chunks of the text as strings. Nothing
        is yielded for empty or whitespace-only text.
    """
    words = text.split()
    starts = range(0, len(words), max_tokens)
    yield from (" ".join(words[s:s + max_tokens]) for s in starts)
# Build one metadata record per answer chunk, walking the FAQ rows in order.
# Each record keeps the row's topic and question next to the chunk text.
metadata = [
    {
        "topic": faq['topic'],
        "question": faq['question'],
        "answer_chunk": piece
    }
    for _, faq in df.iterrows()
    for piece in chunk_text(faq['answer'])
]
# Parallel list of the raw chunk strings: chunks[i] belongs to metadata[i].
chunks = [record["answer_chunk"] for record in metadata]
from sentence_transformers import SentenceTransformer
import numpy as np
# Embeddings come from a Hugging Face sentence-transformers model. For a
# larger dataset, other embedding models may be worth exploring.
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# Embed every answer chunk; the result is requested as a NumPy array.
chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)
# Build a flat L2 FAISS index over the chunk vectors. Vectors are added in
# chunk order, so index position i corresponds to chunks[i] / metadata[i].
import faiss
dimension = chunk_vectors.shape[1]  # size of the second axis of the embedding array
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors)
def embed_query(query):
    """Embed a single query string with the module-level ``embedder``.

    The query is encoded as a one-element batch and the first (only) row of
    the resulting NumPy array is returned.
    """
    batch = embedder.encode([query], convert_to_numpy=True)
    return batch[0]
# TF-IDF vectorizer for lexical matching, configured with the built-in
# English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Fit on the FAQ questions and keep their TF-IDF matrix; retrieve_similar_questions
# maps its row positions back to df rows with df.iloc.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])
def retrieve_similar_answer_chunks(query, k=5):
    """Return metadata for the indexed answer chunks nearest to *query*.

    Args:
        query: The user's query text.
        k: Maximum number of neighbours to request from the FAISS index.

    Returns:
        A list of at most ``k`` metadata dicts (``topic``, ``question``,
        ``answer_chunk``), in the order returned by the index search.
    """
    q_vec = embed_query(query)
    # Search with a one-row float32 batch built from the query vector.
    _, neighbor_ids = index.search(np.array([q_vec]).astype('float32'), k)
    # FAISS fills missing neighbours with label -1 when the index holds fewer
    # than k vectors. Skip those: metadata[-1] would silently return the last
    # chunk as if it were a real hit.
    return [metadata[i] for i in neighbor_ids[0] if i >= 0]
def retrieve_similar_questions(query, k=5):
    """Return the FAQ rows whose questions best match *query* by TF-IDF score.

    Args:
        query: The user's query text.
        k: Number of rows to return. Values <= 0 return an empty list.

    Returns:
        A list of up to ``k`` dicts with ``topic``, ``question`` and
        ``answer`` keys, ordered from highest to lowest score. No score
        threshold is applied, so rows with a score of 0 can be returned.
    """
    if k <= 0:
        # Without this guard, the slice [-0:] below would select every row
        # and a negative k would select an arbitrary tail of the ranking.
        return []
    q_tfidf = tfidf_vectorizer.transform([query])
    # Dot product of every question's TF-IDF row with the query vector.
    scores = (tfidf_matrix @ q_tfidf.T).toarray()
    # argsort is ascending: take the last k positions and reverse them.
    topk_idx = scores[:, 0].argsort()[-k:][::-1]
    return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
def hybrid_retrieve(query, k=5):
    """Combine embedding-based chunk hits with TF-IDF question hits.

    Both retrievers are queried with the same ``k``. Results are concatenated
    (answer chunks first, then question hits) and de-duplicated on the
    ``(topic, question)`` pair, keeping the first occurrence of each pair.

    Returns:
        The de-duplicated context dicts in first-seen order.
    """
    candidates = (
        retrieve_similar_answer_chunks(query, k)
        + retrieve_similar_questions(query, k)
    )
    # dict preserves insertion order and setdefault keeps the earliest entry,
    # so this is a first-wins de-duplication.
    first_by_key = {}
    for entry in candidates:
        first_by_key.setdefault((entry.get('topic'), entry.get('question')), entry)
    return list(first_by_key.values())
def generate_answer(message, model_choice, api_key):
    """Answer *message* with the selected chat model, using retrieved FAQ context.

    Args:
        message: The user's message.
        model_choice: A key of ``model_name``; selects both the model
            identifier and the API base URL.
        api_key: API key passed to the OpenAI client for that endpoint.

    Returns:
        The message content of the first completion choice.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``model_name``.
    """
    contexts = hybrid_retrieve(message, k=5)
    # TF-IDF hits carry the full 'answer'; embedding hits carry 'answer_chunk'.
    context_text = "\n\n".join(
        f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}"
        for c in contexts
    )
    # NOTE(review): the adjacent literals below are concatenated as written,
    # so several sentences run together without a separating space, and the
    # e-mail address looks like an obfuscation placeholder - confirm the
    # intended prompt text.
    system_prompt = (
        "You are an expert airline assistant answering user queries based on the provided context."
        " Use the context to generate a helpful, factual, self-contained answer."
        "If the context doesn't help and you are unable to answer, please reply to the user to reach out to our customer service call center at 1 800 800 000 or email us at [email protected]."
        "If the user is just greeting (e.g., says 'hi', 'hello', 'good morning', etc.), you can greet back briefly and ask how you may assist."
        "If the user is asking something random and unrelated to airline services and it's not a greeting, reply: 'I don't understand your question, can you please rephrase?'"
    )
    messages = [
        {"role": "system", "content": system_prompt},
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion: {message}\nAnswer:"
        },
    ]
    config = model_name[model_choice]
    # Named `client` so the imported `openai` module is not shadowed.
    client = OpenAI(
        base_url=config["url"],
        api_key=api_key,
    )
    response = client.chat.completions.create(
        # Use the model that belongs to the chosen backend. This was
        # hard-coded to "gemini-1.5-flash", so every other choice sent the
        # wrong model id to its endpoint.
        model=config["name"],
        messages=messages,
        temperature=0.7,
        max_tokens=700,
    )
    return response.choices[0].message.content
|