Spaces:

tahirsher
/

RAG_Application_Movie_Datset

Sleeping

App Files Files Community

RAG_Application_Movie_Datset / app.py

tahirsher

Update app.py

e433d99 verified 10 months ago

raw

history blame contribute delete

4.65 kB

	import os
	import pandas as pd
	import gzip
	import requests
	import io
	from groq import Groq
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import streamlit as st

	# Initialize the Groq client.
	# The API key is taken from the "Groq_Api_Key" environment variable;
	# os.environ.get() returns None when that variable is not set.
	client = Groq(api_key=os.environ.get("Groq_Api_Key"))

	# Load the proprietary dataset from GitHub
	def load_dataset_from_github(gzip_url, timeout=30):
	    """Download a gzip-compressed CSV over HTTP and load it into a DataFrame.

	    Args:
	        gzip_url: URL of the ``.csv.gz`` file to fetch.
	        timeout: Seconds to wait for the server before giving up. Without
	            a timeout ``requests.get`` may block indefinitely on a stalled
	            connection.

	    Returns:
	        The ``pandas.DataFrame`` parsed from the decompressed CSV.

	    Raises:
	        Exception: If the server answers with any status other than 200.
	    """
	    response = requests.get(gzip_url, timeout=timeout)
	    if response.status_code != 200:
	        # Keep the original message prefix, but say which status came back.
	        raise Exception(
	            f"Failed to download the dataset. HTTP status: {response.status_code}"
	        )

	    # Decode explicitly as UTF-8 so parsing does not depend on the host locale.
	    with gzip.open(io.BytesIO(response.content), 'rt', encoding='utf-8') as f:
	        return pd.read_csv(f)

	# URL of the Gzip file containing the dataset on GitHub
	gzip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/raw/main/compressed_data.csv.gz"
	# Module-level call: the dataset is downloaded and parsed as soon as this
	# statement runs, and an exception from the loader propagates from here.
	movies_df = load_dataset_from_github(gzip_url)

	# Preprocess the dataset by creating summaries and vectors
	def _as_text(column):
	    """Return *column* as strings, with missing values replaced by ''."""
	    return column.where(column.notna(), "").astype(str)


	def preprocess_data(df):
	    """Add a ``summary`` text column built from the descriptive columns.

	    Each summary has the form
	    ``"<title> (<release_date>): <overview> Genres: <genres> Keywords: <keywords>"``.
	    Missing values contribute an empty string instead of the literal
	    text "nan", so they do not pollute the text used for retrieval.

	    Args:
	        df: DataFrame with ``title``, ``release_date``, ``overview``,
	            ``genres`` and ``keywords`` columns.

	    Returns:
	        The same DataFrame object; the ``summary`` column is added in place.
	    """
	    title = _as_text(df['title'])
	    release_date = _as_text(df['release_date'])
	    overview = _as_text(df['overview'])
	    genres = _as_text(df['genres'])
	    keywords = _as_text(df['keywords'])

	    # Column-wise concatenation avoids a Python-level call per row and
	    # also works for a frame with zero rows.
	    df['summary'] = (
	        title + " (" + release_date + "): " + overview
	        + " Genres: " + genres + " Keywords: " + keywords
	    )
	    return df

	# Add the 'summary' text column that the TF-IDF index is built from.
	movies_df = preprocess_data(movies_df)

	# Convert summaries to TF-IDF vectors for retrieval.
	# `vectorizer` is kept at module level because retrieve_similar_movies()
	# reuses it to transform each incoming query; `tfidf_matrix` is the result
	# of fitting on movies_df['summary'].
	vectorizer = TfidfVectorizer(stop_words='english')
	tfidf_matrix = vectorizer.fit_transform(movies_df['summary'])

	# Define function to retrieve similar movies based on a query
	def retrieve_similar_movies(query, df, tfidf_matrix, top_n=10):
	    """Return the rows of *df* whose TF-IDF vectors best match *query*.

	    Args:
	        query: Free-text search string.
	        df: DataFrame whose rows correspond positionally to the rows of
	            *tfidf_matrix*.
	        tfidf_matrix: TF-IDF matrix produced by the module-level
	            ``vectorizer``.
	        top_n: Maximum number of rows to return.

	    Returns:
	        A DataFrame with at most *top_n* rows, ordered from most to least
	        similar. Rows with equal scores keep their order in *df*. An empty
	        frame (same columns) is returned when *top_n* is zero or negative.
	    """
	    # A plain `[-top_n:]` slice returns every row for top_n == 0 and a
	    # wrong slice for negative values, so handle those up front.
	    if top_n <= 0:
	        return df.iloc[0:0]

	    query_vec = vectorizer.transform([query])
	    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
	    # Sorting the negated scores with a stable sort yields descending
	    # similarity with deterministic tie-breaking.
	    ranked_indices = (-cosine_similarities).argsort(kind="stable")
	    return df.iloc[ranked_indices[:top_n]]

	# Call Groq API for generation based on the retrieved summaries and query
	def generate_summary_with_groq(query, retrieved_text):
	    """Ask the Groq chat model to answer *query* using *retrieved_text*.

	    The query and the retrieved text are combined into a single user
	    message, sent to the ``llama3-8b-8192`` model through the module-level
	    ``client``, and the text of the first returned choice is handed back.
	    """
	    prompt = f"{query}\n\nRelated information:\n{retrieved_text}"
	    user_message = {"role": "user", "content": prompt}

	    completion = client.chat.completions.create(
	        model="llama3-8b-8192",
	        messages=[user_message],
	    )

	    first_choice = completion.choices[0]
	    return first_choice.message.content

	# Function to handle different types of queries
	def _extract_movie_title(query):
	    """Return the text after the last 'details about' (any casing), stripped."""
	    import re

	    return re.split(r"details about", query, flags=re.IGNORECASE)[-1].strip()


	def handle_query(query):
	    """Route a user query to a title lookup, a title listing, or RAG generation.

	    * Queries containing "details" look up movies whose title contains the
	      text following "details about" and return the matching rows as a
	      string, or a not-found message.
	    * Queries containing both "list" and "movies" return the first ten
	      titles as a list.
	    * Anything else retrieves similar movies and asks the Groq model to
	      answer using their summaries.

	    Args:
	        query: The raw text typed by the user.

	    Returns:
	        A string for lookups and generated answers, or a list of titles
	        for listing requests.
	    """
	    not_found = "No details found for the specified movie."
	    lowered = query.lower()

	    if "details" in lowered:
	        # The keyword test above ignores case, so the title extraction
	        # must ignore case as well.
	        movie_title = _extract_movie_title(query)
	        if not movie_title:
	            # An empty pattern would match every title and dump the
	            # whole dataset.
	            return not_found
	        # regex=False: the title is user text, not a pattern; characters
	        # such as "(" or "?" must be matched literally.
	        matches = movies_df[
	            movies_df['title'].str.contains(
	                movie_title, case=False, na=False, regex=False
	            )
	        ]
	        if matches.empty:
	            return not_found
	        return matches.to_string(index=False)

	    if "list" in lowered and "movies" in lowered:
	        # First 10 movie titles as a simple list.
	        return movies_df['title'].head(10).tolist()

	    # Default: retrieval-augmented answer for any other movie question.
	    retrieved_movies = retrieve_similar_movies(query, movies_df, tfidf_matrix)
	    retrieved_summaries = " ".join(retrieved_movies['summary'].values)
	    return generate_summary_with_groq(query, retrieved_summaries)

	# Streamlit Application
	def main():
	    """Render the Streamlit page: question box, answer, and Q/A history.

	    Questions and their responses are accumulated in
	    ``st.session_state.questions`` and ``st.session_state.responses``.
	    Typing "exit", "no" or "quit" shows a goodbye message and stops
	    rendering for this run.
	    """
	    st.title("Movies Analysis: RAG-based Application")

	    # Make sure both history lists exist in the session state before use.
	    for history_key in ('questions', 'responses'):
	        if history_key not in st.session_state:
	            st.session_state[history_key] = []

	    # User input
	    user_query = st.text_input("Ask a question about movies:")

	    # Exit keywords short-circuit the rest of the page.
	    wants_to_exit = bool(user_query) and user_query.lower() in ['exit', 'no', 'quit']
	    if wants_to_exit:
	        st.write("Exiting the application. Goodbye!")
	        return

	    if user_query:
	        # Answer the question and record the exchange.
	        generated_response = handle_query(user_query)
	        st.session_state.questions.append(user_query)
	        st.session_state.responses.append(generated_response)

	        # Show the answer for the current question.
	        st.subheader("Response:")
	        st.write(generated_response)

	        # Invite a follow-up question.
	        st.text("You can ask another question.")

	    # Replay every stored exchange.
	    asked = st.session_state.questions
	    if asked:
	        st.write("### Previous Questions and Responses:")
	        answered = st.session_state.responses
	        for question, response in zip(asked, answered):
	            st.write(f"- Q: {question}")
	            st.write(f" A: {response}")

	# Launch the Streamlit UI only when this file runs as the entry script.
	if __name__ == "__main__":
	    main()