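"""
Movies Analysis: a RAG-style Streamlit application.

Downloads a gzip-compressed movie dataset from GitHub, builds TF-IDF vectors over
per-movie summaries, retrieves the summaries most similar to the user's query, and
asks a Groq-hosted Llama 3 model to generate an answer grounded in that context.
"""
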
import os
import pandas as pd
import gzip
import requests
import io
from groq import Groq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st

# Initialize the Groq client (reads the API key from the Groq_Api_Key environment variable)
client = Groq(api_key=os.environ.get("Groq_Api_Key"))

# Load the proprietary dataset from GitHub (cached so the file is downloaded only once per session)
@st.cache_data
def load_dataset_from_github(gzip_url):
    # Download the gzip-compressed CSV
    response = requests.get(gzip_url, timeout=60)
    if response.status_code == 200:
        # Decompress in memory and read the CSV into a DataFrame
        with gzip.open(io.BytesIO(response.content), 'rt') as f:
            return pd.read_csv(f)
    else:
        raise Exception(f"Failed to download the dataset (HTTP {response.status_code}).")

# URL of the Gzip file containing the dataset on GitHub
gzip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/raw/main/compressed_data.csv.gz"
movies_df = load_dataset_from_github(gzip_url)

# Preprocess the dataset by creating summaries and vectors
def preprocess_data(df):
    df['summary'] = df.apply(lambda row: f"{row['title']} ({row['release_date']}): {row['overview']} "
                                         f"Genres: {row['genres']} Keywords: {row['keywords']}", axis=1)
    return df

movies_df = preprocess_data(movies_df)
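# Each 'summary' string has the form:
#   "<title> (<release_date>): <overview> Genres: <genres> Keywords: <keywords>"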

# Convert summaries to TF-IDF vectors for retrieval
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies_df['summary'])
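# tfidf_matrix is a sparse matrix with one row per movie: the TF-IDF vector of its summary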

# Define function to retrieve similar movies based on a query
def retrieve_similar_movies(query, df, tfidf_matrix, top_n=10):
    query_vec = vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices]
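
# Illustrative call (results depend on the dataset):
#   retrieve_similar_movies("space exploration thriller", movies_df, tfidf_matrix, top_n=5)
#   returns the 5 movies whose summaries are most similar to the query by cosine similarity.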

# Call Groq API for generation based on the retrieved summaries and query
def generate_summary_with_groq(query, retrieved_text):
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": f"{query}\n\nRelated information:\n{retrieved_text}"}
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content

# Function to handle different types of queries
def handle_query(query):
    # Route the query: movie details, a simple movie list, or RAG-based generation
    if "details" in query.lower():
        # Return details about the movie(s); expects queries like "details about <title>"
        movie_title = query.lower().split("details about")[-1].strip()
        details = movies_df[movies_df['title'].str.contains(movie_title, case=False, na=False, regex=False)]
        if not details.empty:
            return details.to_string(index=False)
        else:
            return "No details found for the specified movie."
    elif "list" in query.lower() and "movies" in query.lower():
        return movies_df['title'].tolist()[:10]  # Return first 10 movie titles as a simple list
    else:
        # Default to generating a summary for movie-related queries
        retrieved_movies = retrieve_similar_movies(query, movies_df, tfidf_matrix)
        retrieved_summaries = " ".join(retrieved_movies['summary'].values)
        return generate_summary_with_groq(query, retrieved_summaries)
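
# Example routing (illustrative):
#   "details about Inception"   -> direct DataFrame lookup by title
#   "list some popular movies"  -> first 10 titles from the dataset
#   "recommend a heist movie"   -> TF-IDF retrieval followed by Groq generation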

# Streamlit Application
def main():
    st.title("Movies Analysis: RAG-based Application")
    
    # Initialize session state variables
    if 'questions' not in st.session_state:
        st.session_state.questions = []
        
    if 'responses' not in st.session_state:
        st.session_state.responses = []

    # User input
    user_query = st.text_input("Ask a question about movies:")
    
    if user_query:
        # Check if user wants to exit
        if user_query.lower() in ['exit', 'no', 'quit']:
            st.write("Exiting the application. Goodbye!")
            return
        
        # Handle the user's query
        generated_response = handle_query(user_query)
        
        # Store the question and response in session state
        st.session_state.questions.append(user_query)
        st.session_state.responses.append(generated_response)

        # Display the generated response
        st.subheader("Response:")
        st.write(generated_response)

        # Provide an option for the user to ask another question
        st.text("You can ask another question.")

        # Display the previous questions and responses
        if st.session_state.questions:
            st.write("### Previous Questions and Responses:")
            for question, response in zip(st.session_state.questions, st.session_state.responses):
                st.write(f"- **Q:** {question}")
                st.write(f"  **A:** {response}")

if __name__ == "__main__":
    main()
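
# To run locally (assuming this file is saved as app.py and the key variable name matches the code above):
#   export Groq_Api_Key="<your-groq-api-key>"
#   streamlit run app.py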