# tahirsher's picture
# Update app.py
# e433d99 verified
import os
import pandas as pd
import gzip
import requests
import io
from groq import Groq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
# Build the Groq API client; the key is read from the "Groq_Api_Key" environment variable
# (None is passed through when the variable is unset).
client = Groq(api_key=os.getenv("Groq_Api_Key"))
# Load the proprietary dataset from GitHub
def load_dataset_from_github(gzip_url, timeout=30):
    """Download a gzip-compressed CSV and return it as a DataFrame.

    Args:
        gzip_url: URL of the ``.csv.gz`` file to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``pandas.DataFrame`` parsed from the decompressed CSV.

    Raises:
        RuntimeError: If the server answers with any status other than 200.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(gzip_url, timeout=timeout)
    if response.status_code != 200:
        # Include the status code so the failure can be diagnosed from the message.
        raise RuntimeError(
            f"Failed to download the dataset. (HTTP {response.status_code})"
        )

    # Decompress in memory. The encoding is pinned so decoding does not depend on
    # the host locale -- assumes the CSV is UTF-8 encoded; TODO confirm.
    with gzip.open(io.BytesIO(response.content), 'rt', encoding='utf-8') as f:
        return pd.read_csv(f)
# URL of the Gzip file containing the dataset on GitHub
gzip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/raw/main/compressed_data.csv.gz"
# Download and parse the dataset at module import time; a failed download raises here.
# NOTE(review): Streamlit re-executes the script on each interaction, so this
# download presumably repeats on every rerun -- consider caching; confirm.
movies_df = load_dataset_from_github(gzip_url)
# Preprocess the dataset by creating summaries and vectors
def preprocess_data(df):
    """Add a free-text ``summary`` column describing each movie.

    The summary has the form
    ``"<title> (<release_date>): <overview> Genres: <genres> Keywords: <keywords>"``.
    Missing values are rendered as empty text rather than the literal
    ``"nan"``/``"None"``, which would otherwise leak into the retrieval
    vocabulary and the generated prompt.

    Args:
        df: DataFrame with ``title``, ``release_date``, ``overview``,
            ``genres`` and ``keywords`` columns.

    Returns:
        The same DataFrame object, modified in place with the new
        ``summary`` column.

    Raises:
        KeyError: If any of the required columns is absent.
    """
    columns = ['title', 'release_date', 'overview', 'genres', 'keywords']
    # Work on a filled copy of the needed columns so the source columns keep
    # their original missing-value markers.
    fields = df[columns].fillna('')

    # Build the summaries column-wise; this also works for an empty frame,
    # where a row-wise apply would not yield a usable column.
    df['summary'] = [
        f"{title} ({release_date}): {overview} "
        f"Genres: {genres} Keywords: {keywords}"
        for title, release_date, overview, genres, keywords
        in zip(*(fields[name] for name in columns))
    ]
    return df
# Add the 'summary' column that the retrieval step indexes.
movies_df = preprocess_data(movies_df)
# Convert summaries to TF-IDF vectors for retrieval
# English stop words are dropped. The fitted vectorizer is kept at module level
# because retrieve_similar_movies reuses it to transform incoming queries.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies_df['summary'])
# Define function to retrieve similar movies based on a query
def retrieve_similar_movies(query, df, tfidf_matrix, top_n=10):
    """Return the rows of ``df`` whose summaries are most similar to ``query``.

    The query is vectorised with the module-level ``vectorizer`` and compared
    against ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Free-text search string.
        df: DataFrame whose rows correspond positionally to the rows of
            ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix to compare the query against.
        top_n: Maximum number of rows to return.

    Returns:
        Up to ``top_n`` rows of ``df``, most similar first. An empty frame
        with the same columns when ``top_n`` is zero or negative.
    """
    # A slice of [-0:] selects the whole array, so a non-positive top_n must be
    # handled explicitly instead of returning every row.
    if top_n <= 0:
        return df.iloc[0:0]

    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Descending order of similarity, then keep the first top_n positions.
    top_indices = cosine_similarities.argsort()[::-1][:top_n]
    return df.iloc[top_indices]
# Call Groq API for generation based on the retrieved summaries and query
def generate_summary_with_groq(query, retrieved_text):
    """Ask the Groq chat model to answer ``query`` using ``retrieved_text``.

    The query and the retrieved text are sent together as a single user
    message to the ``llama3-8b-8192`` model through the module-level
    ``client``.

    Args:
        query: The user's question.
        retrieved_text: Supporting text appended under "Related information:".

    Returns:
        The message content of the first choice in the completion.
    """
    prompt = f"{query}\n\nRelated information:\n{retrieved_text}"
    conversation = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# Function to handle different types of queries
def handle_query(query):
    """Route a user query to a title lookup, a title listing, or generation.

    * Queries containing "details" look up rows of ``movies_df`` whose title
      contains the text following "details about" (or the whole query when
      that phrase is absent).
    * Queries containing both "list" and "movies" return the first ten titles.
    * Anything else retrieves similar movies and asks the Groq model.

    Args:
        query: The raw text entered by the user.

    Returns:
        A string for the lookup and generation branches, or a list of title
        strings for the listing branch.
    """
    import re  # local import: only needed for the case-insensitive split below

    lowered = query.lower()

    # Check for specific types of queries
    if "details" in lowered:
        # The keyword check above is case-insensitive, so the split must be too;
        # otherwise "Details about X" would search for the entire query.
        movie_title = re.split(r"details about", query, flags=re.IGNORECASE)[-1].strip()
        if not movie_title:
            # An empty pattern matches every title and would dump the whole dataset.
            return "No details found for the specified movie."

        # regex=False: the title is user input and must be matched literally,
        # so characters such as "(" or "+" cannot raise or change the match.
        matches = movies_df['title'].str.contains(
            movie_title, case=False, na=False, regex=False
        )
        details = movies_df[matches]
        if not details.empty:
            return details.to_string(index=False)
        return "No details found for the specified movie."

    if "list" in lowered and "movies" in lowered:
        # Return first 10 movie titles as a simple list
        return movies_df['title'].head(10).tolist()

    # Default to generating a summary for movie-related queries
    retrieved_movies = retrieve_similar_movies(query, movies_df, tfidf_matrix)
    retrieved_summaries = " ".join(retrieved_movies['summary'].values)
    return generate_summary_with_groq(query, retrieved_summaries)
# Streamlit Application
def main():
    """Render the page: take a question, show the answer, and list the history.

    Questions and their responses are accumulated in ``st.session_state``.
    Entering "exit", "no" or "quit" prints a goodbye message and stops
    rendering for this run.
    """
    st.title("Movies Analysis: RAG-based Application")

    # Make sure both history lists exist in the session state.
    for history_key in ('questions', 'responses'):
        if history_key not in st.session_state:
            st.session_state[history_key] = []
    asked = st.session_state.questions
    answered = st.session_state.responses

    # User input
    user_query = st.text_input("Ask a question about movies:")
    if user_query:
        # Stop early when the user asks to leave.
        wants_exit = user_query.lower() in ('exit', 'no', 'quit')
        if wants_exit:
            st.write("Exiting the application. Goodbye!")
            return

        # Answer the question and record the exchange.
        generated_response = handle_query(user_query)
        asked.append(user_query)
        answered.append(generated_response)

        # Show the answer and invite a follow-up.
        st.subheader("Response:")
        st.write(generated_response)
        st.text("You can ask another question.")

    # NOTE(review): the original indentation was lost; the history section is
    # assumed to sit at function level rather than inside the query branch.
    if asked:
        st.write("### Previous Questions and Responses:")
        for past_question, past_answer in zip(asked, answered):
            st.write(f"- **Q:** {past_question}")
            st.write(f" **A:** {past_answer}")
# Start the app only when this file is executed as the main script, not when imported.
if __name__ == "__main__":
    main()