Spaces:

tahirsher
/

RAG_Application_Movie_Datset

Sleeping

App Files Files Community

tahirsher committed on Oct 29, 2024

Commit

89d557f

verified ·

1 Parent(s): a2787fb

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -34

app.py CHANGED Viewed

@@ -1,36 +1,33 @@
 import os
 import pandas as pd
-import zipfile
 import requests
 import io
 from groq import Groq
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
 # Initialize the Groq client
-client = Groq(api_key=os.environ.get("Groq_Api_Key"))
 # Load the proprietary dataset from GitHub
-def load_dataset_from_github(zip_url):
-    # Download the zip file
-    response = requests.get(zip_url)
     if response.status_code == 200:
-        # Extract the zip file
-        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
-            z.extractall("dataset")  # Extract to the 'dataset' folder
-        # Load the CSV file (assuming it's named 'movie_dataset.csv' inside the zip)
-        return pd.read_csv("dataset/movie_dataset.csv")
     else:
         raise Exception("Failed to download the dataset.")
-# URL of the ZIP file containing the dataset on GitHub
-zip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/blob/main/compressed_data.csv.gz"
-movies_df = load_dataset_from_github(zip_url)
 # Preprocess the dataset by creating summaries and vectors
 def preprocess_data(df):
-    # Combine relevant text columns to form a concise summary for each movie
     df['summary'] = df.apply(lambda row: f"{row['title']} ({row['release_date']}): {row['overview']} "
                                          f"Genres: {row['genres']} Keywords: {row['keywords']}", axis=1)
     return df
@@ -58,30 +55,28 @@ def generate_summary_with_groq(query, retrieved_text):
     )
     return chat_completion.choices[0].message.content
-# Main interactive loop
-def rag_application():
-    print("Welcome to the Movie RAG-based Application!")
-    while True:
-        # Prompt user for a query
-        user_query = input("Ask a question about movies or type 'exit' to quit: ")
-        if user_query.lower() in ['exit', 'no', 'quit']:
-            print("Exiting the application. Goodbye!")
-            break
         # Retrieve relevant movie summaries
         retrieved_movies = retrieve_similar_movies(user_query, movies_df, tfidf_matrix)
         retrieved_summaries = " ".join(retrieved_movies['summary'].values)
         # Generate a summary response based on retrieved movies
         generated_summary = generate_summary_with_groq(user_query, retrieved_summaries)
-        print("Generated Summary:", generated_summary)
-        # Ask if user wants to continue or exit
-        continue_query = input("Do you have another question? (yes/no): ")
-        if continue_query.lower() != 'yes':
-            print("Exiting the application. Goodbye!")
-            break
-# Run the application
-rag_application()

 import os
 import pandas as pd
+import gzip
 import requests
 import io
 from groq import Groq
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+import streamlit as st
 # Initialize the Groq client
+client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
 # Load the proprietary dataset from GitHub
+def load_dataset_from_github(gzip_url):
+    # Download the Gzip file
+    response = requests.get(gzip_url)
     if response.status_code == 200:
+        # Load the Gzip file and read the CSV
+        with gzip.open(io.BytesIO(response.content), 'rt') as f:
+            return pd.read_csv(f)
     else:
         raise Exception("Failed to download the dataset.")
+# URL of the Gzip file containing the dataset on GitHub
+gzip_url = "https://github.com/TahirSher/RAG_App_Moives_Datset/raw/main/compressed_data.csv.gz"
+movies_df = load_dataset_from_github(gzip_url)
 # Preprocess the dataset by creating summaries and vectors
 def preprocess_data(df):
     """Add a 'summary' text column to df and return that same object.
     Each row's summary is built from its 'title', 'release_date', 'overview',
     'genres' and 'keywords' values, so df must already carry those columns.
     The column is assigned on df itself; no copy is made.
     """
     # Row-wise apply (axis=1): one formatted string per row.
     df['summary'] = df.apply(lambda row: f"{row['title']} ({row['release_date']}): {row['overview']} "
                                          f"Genres: {row['genres']} Keywords: {row['keywords']}", axis=1)
     return df
     )
     return chat_completion.choices[0].message.content
+# Streamlit Application
+def main():
+    st.title("Movie RAG-based Application")
+    # User input
+    user_query = st.text_input("Ask a question about movies:")
+    if user_query:
         # Retrieve relevant movie summaries
         retrieved_movies = retrieve_similar_movies(user_query, movies_df, tfidf_matrix)
         retrieved_summaries = " ".join(retrieved_movies['summary'].values)
         # Generate a summary response based on retrieved movies
         generated_summary = generate_summary_with_groq(user_query, retrieved_summaries)
+        # Display the generated summary
+        st.subheader("Generated Summary:")
+        st.write(generated_summary)
+        # Option to ask another question
+        if st.button("Ask another question"):
+            st.experimental_rerun()
+if __name__ == "__main__":
+    main()