|
import gzip
import io
import os
import re

import pandas as pd
import requests
import streamlit as st
from groq import Groq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
# Shared Groq API client; the key is looked up in the "Groq_Api_Key"
# environment variable (None is passed through when the variable is unset).
client = Groq(api_key=os.getenv("Groq_Api_Key"))
|
|
|
|
|
def load_dataset_from_github(gzip_url, timeout=30):
    """Download a gzip-compressed CSV and parse it into a DataFrame.

    Args:
        gzip_url: URL of the ``.csv.gz`` file to fetch.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` on the decompressed
        content.

    Raises:
        RuntimeError: If the server answers with any status other than 200.
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    response = requests.get(gzip_url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            "Failed to download the dataset. "
            f"HTTP status: {response.status_code}"
        )

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with gzip.open(io.BytesIO(response.content), 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)
|
|
|
|
|
gzip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/raw/main/compressed_data.csv.gz"

# Streamlit re-executes the whole script on every user interaction; wrapping
# the loader in st.cache_data keeps the dataset from being downloaded again on
# each rerun. Each call hands back its own copy of the cached DataFrame.
movies_df = st.cache_data(load_dataset_from_github)(gzip_url)
|
|
|
|
|
def preprocess_data(df):
    """Add a free-text ``summary`` column built from the descriptive fields.

    Each summary has the form
    ``"<title> (<release_date>): <overview> Genres: <genres> Keywords: <keywords>"``.
    Missing values (NaN/None) are rendered as empty strings rather than the
    literal text "nan"/"None", so they do not leak into the text that is
    later vectorised and sent to the language model.

    Args:
        df: DataFrame with ``title``, ``release_date``, ``overview``,
            ``genres`` and ``keywords`` columns. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``summary`` column.
    """
    def _text(value):
        # Assumes scalar cell values, as produced by read_csv.
        return "" if pd.isna(value) else value

    columns = zip(
        df['title'], df['release_date'], df['overview'],
        df['genres'], df['keywords'],
    )
    # A plain comprehension also copes with an empty frame, where a row-wise
    # ``apply`` does not return a Series.
    df['summary'] = [
        f"{_text(title)} ({_text(released)}): {_text(overview)} "
        f"Genres: {_text(genres)} Keywords: {_text(keywords)}"
        for title, released, overview, genres, keywords in columns
    ]
    return df
|
|
|
@st.cache_resource
def _build_search_index(raw_df):
    """Build the summaries and fit TF-IDF once, then reuse across reruns.

    Returns a ``(prepared_df, fitted_vectorizer, tfidf_matrix)`` tuple. The
    cached objects are shared between reruns, so callers must treat them as
    read-only.
    """
    prepared = preprocess_data(raw_df)
    fitted = TfidfVectorizer(stop_words='english')
    matrix = fitted.fit_transform(prepared['summary'])
    return prepared, fitted, matrix


# Streamlit re-executes the script on every interaction; without the cache
# the summaries and the TF-IDF fit would be recomputed each time.
movies_df, vectorizer, tfidf_matrix = _build_search_index(movies_df)
|
|
|
|
|
def retrieve_similar_movies(query, df, tfidf_matrix, top_n=10):
    """Return the rows of ``df`` whose TF-IDF vectors best match ``query``.

    The query is vectorised with the module-level ``vectorizer``, so
    ``tfidf_matrix`` must have been produced by that same vectorizer and its
    rows must line up positionally with the rows of ``df``.

    Args:
        query: Free-text search string.
        df: DataFrame the matrix rows correspond to.
        tfidf_matrix: TF-IDF matrix with one row per row of ``df``.
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df`` ordered from most to least similar;
        an empty frame when ``top_n`` is zero or negative.
    """
    if top_n <= 0:
        # ``argsort()[-0:]`` would select every row instead of none.
        return df.iloc[0:0]

    query_vec = vectorizer.transform([query])
    scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    best_first = scores.argsort()[::-1][:top_n]
    return df.iloc[best_first]
|
|
|
|
|
def generate_summary_with_groq(query, retrieved_text):
    """Send the question plus retrieved context to the Groq chat model.

    Args:
        query: The user's question.
        retrieved_text: Context text appended after the question under a
            "Related information:" heading.

    Returns:
        The message content of the first choice in the completion.
    """
    prompt = f"{query}\n\nRelated information:\n{retrieved_text}"
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
|
|
|
|
def handle_query(query):
    """Answer a user question by lookup, listing, or retrieval + generation.

    Routing, checked in this order on the lower-cased query:

    * contains "details": look up titles containing the text that follows
      "details about" (or the whole query when that phrase is absent);
    * contains both "list" and "movies": return the first ten titles;
    * otherwise: retrieve similar movies and ask the Groq model.

    Args:
        query: The user's question.

    Returns:
        A string for the lookup and generated answers, or a list of titles
        for the listing branch.
    """
    lowered = query.lower()

    if "details" in lowered:
        # Split case-insensitively so "Details About X" is handled the same
        # way as the lower-case trigger check above.
        movie_title = re.split(
            r"details\s+about", query, flags=re.IGNORECASE
        )[-1].strip()
        if not movie_title:
            # An empty pattern would match every title and dump the whole
            # dataset into the reply.
            return "No details found for the specified movie."

        # regex=False: the title is user input, not a pattern; characters
        # such as "(" or "*" must be matched literally.
        is_match = movies_df['title'].str.contains(
            movie_title, case=False, na=False, regex=False
        )
        details = movies_df[is_match]
        if details.empty:
            return "No details found for the specified movie."
        return details.to_string(index=False)

    if "list" in lowered and "movies" in lowered:
        return movies_df['title'].head(10).tolist()

    retrieved_movies = retrieve_similar_movies(query, movies_df, tfidf_matrix)
    retrieved_summaries = " ".join(retrieved_movies['summary'].values)
    return generate_summary_with_groq(query, retrieved_summaries)
|
|
|
|
|
def main():
    """Render the Streamlit page: question box, answer, and Q&A history.

    Questions and answers are accumulated in ``st.session_state`` under the
    ``questions`` and ``responses`` keys. Entering "exit", "no" or "quit"
    shows a goodbye message and skips the rest of the page.
    """
    st.title("Movies Analysis: RAG-based Application")

    # Create the history lists the first time they are needed.
    if 'questions' not in st.session_state:
        st.session_state.questions = []
    if 'responses' not in st.session_state:
        st.session_state.responses = []

    # Strip surrounding whitespace so " quit " is recognised as an exit word
    # and whitespace-only input is not forwarded to the model.
    user_query = st.text_input("Ask a question about movies:").strip()

    if user_query:
        if user_query.lower() in ['exit', 'no', 'quit']:
            st.write("Exiting the application. Goodbye!")
            return

        generated_response = handle_query(user_query)

        st.session_state.questions.append(user_query)
        st.session_state.responses.append(generated_response)

        st.subheader("Response:")
        st.write(generated_response)

        st.text("You can ask another question.")

    # The history is rendered after the append above, so it also includes
    # the question that was just answered.
    if st.session_state.questions:
        st.write("### Previous Questions and Responses:")
        for question, response in zip(st.session_state.questions, st.session_state.responses):
            st.write(f"- **Q:** {question}")
            st.write(f" **A:** {response}")


if __name__ == "__main__":
    main()
|
|