Palbha Kulkarni (Nazwale)
committed on
Create baseline_code.py
baseline_code.py +109 -0
baseline_code.py
ADDED
@@ -0,0 +1,109 @@
import pandas as pd
from openai import OpenAI

df = pd.read_csv("/data/faq_data.csv")
print(df)

# Feel free to use ChatGPT or other models - Google offers a free Gemini API, hence I have used it here


import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


# Optional: chunk answers into smaller pieces (about 200 words, as a rough token proxy) for better retrieval
def chunk_text(text, max_tokens=200):
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield " ".join(words[i:i + max_tokens])

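# Quick illustration (hypothetical input, commented out so it does not run): a
# 450-word answer would yield chunks of 200, 200, and 50 words.
# sample_answer = " ".join(["word"] * 450)
# print([len(c.split()) for c in chunk_text(sample_answer)])  # -> [200, 200, 50]
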
chunks = []
metadata = []
for idx, row in df.iterrows():
    for c in chunk_text(row['answer']):
        chunks.append(c)
        metadata.append({
            "topic": row['topic'],
            "question": row['question'],
            "answer_chunk": c
        })


from sentence_transformers import SentenceTransformer

# I am using a Hugging Face model to generate embeddings; if the data were large, one could try and explore other embedding models
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Embed chunks
chunk_vectors = embedder.encode(chunks, convert_to_numpy=True)

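# Sanity check (optional): all-MiniLM-L6-v2 produces 384-dimensional embeddings,
# so chunk_vectors should have shape (len(chunks), 384).
# print(chunk_vectors.shape)
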
# Build a FAISS index over the chunk embeddings
import faiss

dimension = chunk_vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(chunk_vectors.astype('float32'))  # FAISS expects float32

# Query embedding helper
def embed_query(query):
    return embedder.encode([query], convert_to_numpy=True)[0]
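
# Example (hypothetical, commented out): index.search returns distances D and row
# indices I, each with shape (num_queries, k); rows of I index into chunks/metadata.
# D, I = index.search(chunk_vectors[:1].astype('float32'), 3)
# print(D.shape, I.shape)  # -> (1, 3) (1, 3)
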
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question'])

def retrieve_similar_answer_chunks(query, k=5):
    q_vec = embed_query(query)
    D, I = index.search(np.array([q_vec]).astype('float32'), k)
    return [metadata[i] for i in I[0]]

def retrieve_similar_questions(query, k=5):
    q_tfidf = tfidf_vectorizer.transform([query])
    scores = (tfidf_matrix @ q_tfidf.T).toarray()
    topk_idx = scores[:, 0].argsort()[-k:][::-1]
    return df.iloc[topk_idx][['topic', 'question', 'answer']].to_dict(orient='records')
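
# Note: TfidfVectorizer L2-normalizes rows by default, so the sparse dot product in
# retrieve_similar_questions is the cosine similarity between the query and each stored question.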

def hybrid_retrieve(query, k=5):
    answer_chunks = retrieve_similar_answer_chunks(query, k)
    question_hits = retrieve_similar_questions(query, k)
    combined_contexts = answer_chunks + question_hits

    # Deduplicate by (topic, question) so the same FAQ entry is not sent to the model twice
    seen = set()
    filtered = []
    for c in combined_contexts:
        key = (c.get('topic'), c.get('question'))
        if key not in seen:
            filtered.append(c)
            seen.add(key)
    return filtered

def generate_answer(query, model_choice, api_key):
    contexts = hybrid_retrieve(query, k=5)
    context_text = "\n\n".join(
        f"Q: {c['question']}\nA: {c.get('answer') or c.get('answer_chunk')}" for c in contexts
    )

    messages = [
        {
            "role": "system",
            "content": (
                "You are an expert airline assistant answering user queries based on the provided context. "
                "Use the context to generate a helpful, factual, self-contained answer. "
                "If the context doesn't help and you are unable to answer, tell the user to reach out to our "
                "customer service call center at 1 800 800 000 or email us at [email protected]. "
                "If the user asks a random question unrelated to an airline assistant, reply: "
                "I don't understand your question, can you please rephrase?"
            )
        },
        {
            "role": "user",
            "content": f"Context:\n{context_text}\n\nQuestion: {query}\nAnswer:"
        }
    ]
    # Gemini's OpenAI-compatible endpoint (note the /openai/ suffix per Google's docs)
    client = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=api_key,
    )

    response = client.chat.completions.create(
        model=model_choice or "gemini-1.5-flash",  # honour the caller's choice, default to gemini-1.5-flash
        messages=messages,
        temperature=0.7,
        max_tokens=700,
    )
    return response.choices[0].message.content