Palbha Kulkarni (Nazwale)
committed on
Create baseline_code.py
baseline_code.py +109 -0
baseline_code.py
ADDED
@@ -0,0 +1,109 @@
import pandas as pd
from openai import OpenAI

df = pd.read_csv("/data/faq_data.csv")
print(df)

# Feel free to use ChatGPT or other models - Google offers a free Gemini API, hence I have used it here


import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


# Optional: chunk answers into smaller pieces (about 200 words, as a rough token proxy) for better retrieval
def chunk_text(text, max_tokens=200):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i + max_tokens])

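# Quick illustration (hypothetical input, commented out so it does not run): a
# 450-word answer would yield chunks of 200, 200, and 50 words.
# sample_answer = " ".join(["word"] * 450)
# print([len(c.split()) for c in chunk_text(sample_answer)])  # -> [200, 200, 50]
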
chunks = []
metadata = []
for idx, row in df.iterrows():
    for c in chunk_text(row['answer']):
        chunks.append(c)
        metadata.append({
            "topic": row['topic'],
            "question": row['question'],
            "answer_chunk": c
        })


from sentence_transformers import SentenceTransformer

# I am using a Hugging Face model to generate embeddings; if the data were large, one could try and explore other embedding models
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed chunks
chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)

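# Sanity check (optional): all-MiniLM-L6-v2 produces 384-dimensional embeddings,
# so chunk_vectors should have shape (len(chunks), 384).
# print(chunk_vectors.shape)
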
# Build a FAISS index over the chunk embeddings
import faiss

dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors.astype('float32'))  # FAISS expects float32

# Query embedding helper
def embed_query(query):
    return embedder.encode([query], convert_to_numpy=True)[0]
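
# Example (hypothetical, commented out): index.search returns distances D and row
# indices I, each with shape (num_queries, k); rows of I index into chunks/metadata.
# D, I = index.search(chunk_vectors[:1].astype('float32'), 3)
# print(D.shape, I.shape)  # -> (1, 3) (1, 3)
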
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])

def retrieve_similar_answer_chunks(query, k=5):
    q_vec = embed_query(query)
    D, I = index.search(np.array([q_vec]).astype('float32'), k)
    return [metadata[i] for i in I[0]]

def retrieve_similar_questions(query, k=5):
    q_tfidf = tfidf_vectorizer.transform([query])
    scores = (tfidf_matrix @ q_tfidf.T).toarray()
    topk_idx = scores[:, 0].argsort()[-k:][::-1]
    return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
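
# Note: TfidfVectorizer L2-normalizes rows by default, so the sparse dot product in
# retrieve_similar_questions is the cosine similarity between the query and each stored question.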

def hybrid_retrieve(query, k=5):
    answer_chunks = retrieve_similar_answer_chunks(query, k)
    question_hits = retrieve_similar_questions(query, k)
    combined_contexts = answer_chunks + question_hits

    # Deduplicate by (topic, question) so the same FAQ entry is not sent to the model twice
    seen = set()
    filtered = []
    for c in combined_contexts:
        key = (c.get('topic'), c.get('question'))
        if key not in seen:
            filtered.append(c)
            seen.add(key)
    return filtered

def generate_answer(query, model_choice, api_key):
    contexts = hybrid_retrieve(query, k=5)
    context_text = "\n\n".join(
        f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}" for c in contexts
    )

    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert airline assistant answering user queries based on the provided context. "
                "Use the context to generate a helpful, factual, self-contained answer. "
                "If the context doesn't help and you are unable to answer, tell the user to reach out to our "
                "customer service call center at 1 800 800 000 or email us at [email protected]. "
                "If the user asks a random question unrelated to an airline assistant, reply: "
                "I don't understand your question, can you please rephrase?"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
        }
    ]
    # Gemini's OpenAI-compatible endpoint (note the /openai/ suffix per Google's docs)
    client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=api_key,
    )

    response = client.chat.completions.create(
        model=model_choice or "gemini-1.5-flash",  # honour the caller's choice, default to gemini-1.5-flash
        messages=messages,
        temperature=0.7,
        max_tokens=700,
    )
    return response.choices[0].message.content