Palbha Kulkarni (Nazwale) committed on
Commit 7f681f1 · unverified · 1 Parent(s): 4045119

Create baseline_code.py

Files changed (1)
  1. baseline_code.py +109 -0
baseline_code.py ADDED
@@ -0,0 +1,109 @@
+ import pandas as pd
+ import numpy as np
+
+ df = pd.read_csv("/data/faq_data.csv")
+ print(df)
+
+ from openai import OpenAI
+ from google.colab import userdata  # Colab secret store, used to pass the API key
+ # Feel free to use ChatGPT or other models - Google offers a free Gemini API, hence I have used it
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+
+ # Optional: chunk answers into smaller pieces (about 200 whitespace-separated words) for better retrieval
+ def chunk_text(text, max_tokens=200):
+     words = text.split()
+     for i in range(0, len(words), max_tokens):
+         yield " ".join(words[i:i + max_tokens])
+
+ chunks = []
+ metadata = []
+ for idx, row in df.iterrows():
+     for c in chunk_text(row['answer']):
+         chunks.append(c)
+         metadata.append({
+             "topic": row['topic'],
+             "question": row['question'],
+             "answer_chunk": c
+         })
+
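+ # Optional sanity check: every chunk should have a matching metadata record
+ assert len(chunks) == len(metadata)
+ print(f"Built {len(chunks)} chunks from {len(df)} FAQ rows")
+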
+
+ from sentence_transformers import SentenceTransformer
+ # I am using a Hugging Face model to generate embeddings; if the data were larger, one could explore other embedding models
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
+
+ # Embed chunks
+ chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)
+
+ # Build a FAISS index over the chunk embeddings
+ import faiss
+
+ dimension = chunk_vectors.shape[1]
+ index = faiss.IndexFlatL2(dimension)
+ index.add(chunk_vectors.astype('float32'))  # FAISS expects float32 vectors
+
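+ # Optional sanity check: the index should hold one vector per chunk
+ print(f"FAISS index: {index.ntotal} vectors of dimension {dimension}")
+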
+ # Query embedding helper
+ def embed_query(query):
+     return embedder.encode([query], convert_to_numpy=True)[0]
+
+ # TF-IDF over the FAQ questions, used for lexical matching
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english')
+ tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])
+
+ # Semantic retrieval: nearest answer chunks by L2 distance in embedding space
+ def retrieve_similar_answer_chunks(query, k=5):
+     q_vec = embed_query(query)
+     D, I = index.search(np.array([q_vec]).astype('float32'), k)
+     return [metadata[i] for i in I[0]]
+
+ # Lexical retrieval: questions ranked by TF-IDF dot-product score
+ def retrieve_similar_questions(query, k=5):
+     q_tfidf = tfidf_vectorizer.transform([query])
+     scores = (tfidf_matrix @ q_tfidf.T).toarray()
+     topk_idx = scores[:, 0].argsort()[-k:][::-1]
+     return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
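+
+ # Optional illustration (the query string is made up): compare the two retrievers
+ sample_query = "How do I change my flight date?"
+ print(retrieve_similar_answer_chunks(sample_query, k=2))
+ print(retrieve_similar_questions(sample_query, k=2))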
+
+ # Hybrid retrieval: merge semantic hits on answers with lexical hits on questions
+ def hybrid_retrieve(query, k=5):
+     answer_chunks = retrieve_similar_answer_chunks(query, k)
+     question_hits = retrieve_similar_questions(query, k)
+     combined_contexts = answer_chunks + question_hits
+
+     # Deduplicate so the same FAQ entry isn't passed to the model twice
+     seen = set()
+     filtered = []
+     for c in combined_contexts:
+         key = (c.get('topic'), c.get('question'))
+         if key not in seen:
+             filtered.append(c)
+             seen.add(key)
+     return filtered
+
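+ # Optional illustration: the hybrid retriever returns at most 2*k contexts,
+ # fewer once duplicates of the same (topic, question) pair are dropped
+ print(f"{len(hybrid_retrieve(sample_query, k=5))} contexts after dedup")
+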
+ def generate_answer(query, model_choice, api_key):
+     contexts = hybrid_retrieve(query, k=5)
+     context_text = "\n\n".join(
+         f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}" for c in contexts
+     )
+
+     messages = [
+         {
+             "role": "system",
+             "content": (
+                 "You are an expert airline assistant answering user queries based on the provided context. "
+                 "Use the context to generate a helpful, factual, self-contained answer. "
+                 "If the context doesn't help and you are unable to answer, tell the user to reach out to our "
+                 "customer service call center at 1 800 800 000 or email us at [email protected]. "
+                 "If the user asks a random question that is not meant for an airline assistant, reply: "
+                 "'I don't understand your question, can you please rephrase?'"
+             )
+         },
+         {
+             "role": "user",
+             "content": f"Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
+         }
+     ]
+
+     # Gemini's OpenAI-compatible endpoint (the /openai/ suffix is required by the
+     # compatibility layer); named `client` to avoid shadowing the openai module
+     client = OpenAI(
+         base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
+         api_key=api_key,
+     )
+
+     response = client.chat.completions.create(
+         model=model_choice or "gemini-1.5-flash",  # use the caller's choice, default to the original model
+         messages=messages,
+         temperature=0.7,
+         max_tokens=700,
+     )
+     return response.choices[0].message.content
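+
+ # Example usage (illustrative; assumes the Gemini API key is stored as a Colab
+ # secret - the secret name "GOOGLE_API_KEY" is a placeholder, adjust to your setup):
+ # answer = generate_answer(
+ #     "Can I carry a power bank in my checked baggage?",
+ #     model_choice="gemini-1.5-flash",
+ #     api_key=userdata.get("GOOGLE_API_KEY"),
+ # )
+ # print(answer)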