Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,21 +1,18 @@
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
3 |
-
import streamlit as st
|
4 |
import plotly.express as px
|
|
|
5 |
from wordcloud import WordCloud
|
6 |
import matplotlib.pyplot as plt
|
7 |
from collections import Counter
|
8 |
import numpy as np
|
|
|
9 |
|
10 |
# Function to load data from SQLite database
|
11 |
def load_data(db_file):
    """Open a SQLite connection to ``db_file`` and return it.

    Args:
        db_file: Database location, passed unchanged to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``. This function never closes it, so
        the caller is responsible for doing so.
    """
    return sqlite3.connect(db_file)
|
14 |
|
15 |
-
# Function to fetch data from database based on query
|
16 |
-
def fetch_data(conn, query, params=None):
    """Run ``query`` on ``conn`` and return the result set as a DataFrame.

    Args:
        conn: Open database connection accepted by ``pd.read_sql_query``.
        query: SQL text to execute.
        params: Optional values bound to the placeholders in ``query``.
            Prefer this over formatting values into the SQL string.

    Returns:
        ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn, params=params)
|
18 |
-
|
19 |
# Function to fetch summary info from database
|
20 |
def fetch_summary_info(conn):
|
21 |
# Fetch total count of movies
|
@@ -43,6 +40,31 @@ def fetch_summary_info(conn):
|
|
43 |
|
44 |
return total_movies, total_years, avg_rating
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
# Function to plot global map of total films per region
|
47 |
def plot_global_map(df):
|
48 |
# Country code to name mapping
|
@@ -84,125 +106,128 @@ def plot_global_map(df):
|
|
84 |
'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
|
85 |
'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
|
86 |
'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
|
87 |
-
'VE': 'Venezuela
|
88 |
}
|
89 |
-
|
90 |
-
#
|
91 |
df['region'] = df['region'].map(country_mapping)
|
92 |
-
|
93 |
-
#
|
94 |
-
|
95 |
-
|
96 |
-
#
|
97 |
-
|
98 |
-
|
99 |
-
# Plotting the global map
|
100 |
-
fig = px.choropleth(df_grouped,
|
101 |
-
locations='region',
|
102 |
-
locationmode='country names',
|
103 |
-
color='log_total_films',
|
104 |
-
hover_name='region',
|
105 |
-
color_continuous_scale=px.colors.sequential.Plasma,
|
106 |
-
labels={'log_total_films': 'Log Total Films'},
|
107 |
title='Global Map of Total Films by Country')
|
108 |
|
109 |
return fig
|
110 |
|
111 |
-
# Function to
|
112 |
-
def
|
113 |
-
|
114 |
-
SELECT genres
|
115 |
-
FROM title_basics
|
116 |
-
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
117 |
-
'''
|
118 |
-
df = fetch_data(conn, query)
|
119 |
-
|
120 |
-
# Process genres
|
121 |
-
genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
|
122 |
-
genre_counts = Counter(genres)
|
123 |
-
|
124 |
-
# Generate the word cloud
|
125 |
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
|
126 |
|
127 |
-
#
|
128 |
plt.figure(figsize=(10, 5))
|
129 |
plt.imshow(wordcloud, interpolation='bilinear')
|
130 |
plt.axis('off')
|
131 |
-
plt.title('Top Genres
|
132 |
-
|
133 |
|
134 |
-
# Function to find best
|
135 |
def find_best_movies_by_genre(conn):
|
136 |
query = '''
|
137 |
-
SELECT tb.
|
138 |
FROM title_basics tb
|
139 |
JOIN title_ratings tr ON tb.tconst = tr.tconst
|
140 |
WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
|
|
|
141 |
'''
|
142 |
df = fetch_data(conn, query)
|
143 |
-
|
144 |
-
# Split genres and
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
|
|
|
|
154 |
|
155 |
return best_movies_by_genre
|
156 |
|
157 |
-
#
|
158 |
def main():
|
|
|
|
|
159 |
# Load data from SQLite database
|
160 |
-
db_file = 'imdb_data.db' # Adjust path as needed
|
161 |
conn = load_data(db_file)
|
162 |
|
163 |
-
# Fetch
|
164 |
total_movies, total_years, avg_rating = fetch_summary_info(conn)
|
165 |
-
|
166 |
-
# Display summary information in three columns with bold outline
|
167 |
-
st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
|
168 |
-
st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)
|
169 |
|
170 |
-
# Layout
|
171 |
col1, col2, col3 = st.columns(3)
|
172 |
with col1:
|
173 |
st.subheader("Total Movies")
|
174 |
-
st.markdown(f"<
|
175 |
with col2:
|
176 |
st.subheader("Total Years")
|
177 |
-
st.markdown(f"<
|
178 |
with col3:
|
179 |
st.subheader("Average Rating")
|
180 |
-
st.markdown(f"<
|
181 |
|
182 |
-
#
|
183 |
-
|
184 |
-
col1, col2 = st.columns(2)
|
185 |
|
186 |
-
#
|
187 |
-
|
|
|
|
|
188 |
st.subheader("Best Movie of Each Genre")
|
189 |
-
best_movies_by_genre = find_best_movies_by_genre(conn)
|
190 |
st.write(best_movies_by_genre)
|
191 |
-
|
192 |
-
# Visualizations
|
193 |
-
with col1:
|
194 |
st.subheader("Global Map of Total Films by Country")
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
200 |
st.subheader("Word Cloud of Top Genres")
|
201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
202 |
|
203 |
# Close database connection
|
204 |
conn.close()
|
205 |
|
206 |
-
# Execute the main function
|
207 |
if __name__ == '__main__':
|
208 |
main()
|
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
|
|
3 |
import plotly.express as px
|
4 |
+
import plotly.graph_objects as go
|
5 |
from wordcloud import WordCloud
|
6 |
import matplotlib.pyplot as plt
|
7 |
from collections import Counter
|
8 |
import numpy as np
|
9 |
+
import streamlit as st
|
10 |
|
11 |
# Function to load data from SQLite database
|
12 |
def load_data(db_file):
    """Open a SQLite connection to ``db_file`` and return it.

    Args:
        db_file: Database location, passed unchanged to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``. This function never closes it, so
        the caller is responsible for doing so.
    """
    return sqlite3.connect(db_file)
|
15 |
|
|
|
|
|
|
|
|
|
16 |
# Function to fetch summary info from database
|
17 |
def fetch_summary_info(conn):
|
18 |
# Fetch total count of movies
|
|
|
40 |
|
41 |
return total_movies, total_years, avg_rating
|
42 |
|
43 |
+
# Function to fetch data from database based on query
|
44 |
+
def fetch_data(conn, query, params=None):
    """Run ``query`` on ``conn`` and return the result set as a DataFrame.

    Args:
        conn: Open database connection accepted by ``pd.read_sql_query``.
        query: SQL text to execute.
        params: Optional values bound to the placeholders in ``query``.
            Prefer this over formatting values into the SQL string.

    Returns:
        ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn, params=params)
|
46 |
+
|
47 |
+
# Function to fetch genre movie releases by year
|
48 |
+
def fetch_genre_movie_releases(conn):
    """Count movie releases for every (year, genre) pair.

    Args:
        conn: Open database connection exposing a ``title_basics`` table
            with ``titleType``, ``startYear`` and ``genres`` columns.

    Returns:
        DataFrame with columns ``startYear`` (int), ``genres`` (one genre
        name per row) and ``count``, ordered by year and then genre.
    """
    # '\\N' is the literal two-character marker \N that the query treats as
    # "missing"; rows carrying it in either column are excluded.
    query = '''
        SELECT startYear, genres
        FROM title_basics
        WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
    '''
    df = pd.read_sql_query(query, conn)

    # A title lists its genres comma-separated; produce one row per genre.
    df['genres'] = df['genres'].str.split(',')
    df = df.explode('genres').reset_index(drop=True)

    # Coerce instead of raising so a single malformed year cannot abort the
    # whole aggregation; such rows are dropped just below.
    years = pd.to_numeric(df['startYear'], errors='coerce')
    df = df.assign(startYear=years).dropna(subset=['startYear', 'genres'])
    df = df.astype({'startYear': int})

    return df.groupby(['startYear', 'genres']).size().reset_index(name='count')
|
67 |
+
|
68 |
# Function to plot global map of total films per region
|
69 |
def plot_global_map(df):
|
70 |
# Country code to name mapping
|
|
|
106 |
'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
|
107 |
'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
|
108 |
'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
|
109 |
+
'VE': 'Venezuela', 'VN': 'Viet Nam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
|
110 |
}
|
111 |
+
|
112 |
+
# Mapping country codes to names
|
113 |
df['region'] = df['region'].map(country_mapping)
|
114 |
+
|
115 |
+
# Count total films per country
|
116 |
+
country_counts = df['region'].value_counts().reset_index(name='total_films')
|
117 |
+
|
118 |
+
# Plotting with Plotly Express
|
119 |
+
fig = px.choropleth(country_counts, locations='index', locationmode='country names', color='total_films',
|
120 |
+
hover_name='index', color_continuous_scale='Viridis',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
title='Global Map of Total Films by Country')
|
122 |
|
123 |
return fig
|
124 |
|
125 |
+
# Function to plot word cloud of top genres
|
126 |
+
def plot_word_cloud(genres_list):
    """Draw a word cloud of genre frequencies on a new pyplot figure.

    Args:
        genres_list: Iterable of genre names, one entry per occurrence.
            Entries that are not non-empty strings are ignored.

    Returns:
        The ``matplotlib.pyplot`` module with the new figure as the current
        figure, so callers can keep passing the result to ``st.pyplot``.
    """
    genre_counts = Counter(
        genre for genre in genres_list if isinstance(genre, str) and genre
    )

    plt.figure(figsize=(10, 5))
    # With nothing to draw, leave the axes blank rather than handing an empty
    # frequency table to WordCloud.
    # NOTE(review): WordCloud is expected to reject empty input - confirm.
    if genre_counts:
        wordcloud = WordCloud(
            width=800, height=400, background_color='white'
        ).generate_from_frequencies(genre_counts)
        plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Top Genres Word Cloud')
    return plt
|
136 |
|
137 |
+
# Function to find best movies by genre
|
138 |
def find_best_movies_by_genre(conn):
    """Return the highest-rated movie for each genre.

    Args:
        conn: Open database connection exposing ``title_basics``
            (``tconst``, ``titleType``, ``primaryTitle``, ``genres``) and
            ``title_ratings`` (``tconst``, ``averageRating``).

    Returns:
        DataFrame with columns ``Genre``, ``Movie`` and ``Rating``, one row
        per genre, in the order each genre is first seen when walking the
        movies from highest to lowest rating. The columns are present even
        when no movie matches.
    """
    query = '''
        SELECT tb.genres, tb.primaryTitle, tr.averageRating
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
        ORDER BY tr.averageRating DESC
    '''
    df = pd.read_sql_query(query, conn)

    columns = ['Genre', 'Movie', 'Rating']
    if df.empty:
        return pd.DataFrame(columns=columns)

    # Rows arrive best-rated first, so after expanding each title into one
    # row per genre, the first row kept for a genre is its top-rated movie.
    per_genre = df.assign(genres=df['genres'].str.split(',')).explode('genres')
    best = per_genre.drop_duplicates(subset='genres', keep='first')

    best_movies_by_genre = best.rename(
        columns={
            'genres': 'Genre',
            'primaryTitle': 'Movie',
            'averageRating': 'Rating',
        }
    )[columns].reset_index(drop=True)

    return best_movies_by_genre
|
163 |
|
164 |
+
# Streamlit app
|
165 |
def main(db_file='/content/imdb_data/imdb_data.db'):
    """Render the IMDb dashboard as a Streamlit page.

    The page shows three summary metrics, then a row holding the
    best-movie-per-genre table, the global film map and the genre word
    cloud, and finally a line chart of releases per genre and year.

    Args:
        db_file: Path of the SQLite database to read. Defaults to the
            location the app previously hard-coded.
    """
    st.title("IMDb Movie Data Analysis")

    # Load data from SQLite database
    conn = load_data(db_file)
    try:
        # Fetch summary info
        total_movies, total_years, avg_rating = fetch_summary_info(conn)

        # Layout for summary info
        col1, col2, col3 = st.columns(3)
        with col1:
            st.subheader("Total Movies")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_movies}</h1>", unsafe_allow_html=True)
        with col2:
            st.subheader("Total Years")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_years}</h1>", unsafe_allow_html=True)
        with col3:
            st.subheader("Average Rating")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{avg_rating:.2f}</h1>", unsafe_allow_html=True)

        # Fetch best movies by genre
        best_movies_by_genre = find_best_movies_by_genre(conn)

        # Layout for table, global map, and word cloud
        st.markdown("---")
        row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
        with row1_col1:
            st.subheader("Best Movie of Each Genre")
            st.write(best_movies_by_genre)
        with row1_col2:
            st.subheader("Global Map of Total Films by Country")
            query_country_distribution = '''
                SELECT region
                FROM title_akas
            '''
            country_distribution = fetch_data(conn, query_country_distribution)
            fig = plot_global_map(country_distribution)
            st.plotly_chart(fig)
        with row1_col3:
            st.subheader("Word Cloud of Top Genres")
            query_genres = '''
                SELECT genres
                FROM title_basics
                WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
            '''
            genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
            # Named so it does not shadow the module-level ``plt`` import.
            wordcloud_plot = plot_word_cloud(genres_list)
            st.pyplot(wordcloud_plot)

        # Fetch genre movie releases by year
        genre_counts = fetch_genre_movie_releases(conn)

        # Plot line chart using Plotly Express
        st.markdown("---")
        st.subheader("Genre Movie Releases by Year")
        fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
                                   title='Genre Movie Releases by Year',
                                   labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
        fig_genre_counts.update_layout(xaxis_tickmode='linear')  # Ensure x-axis ticks are shown in a linear manner
        fig_genre_counts.update_xaxes(range=[2000, 2025])
        st.plotly_chart(fig_genre_counts)
    finally:
        # Close database connection even when a query or chart step raises
        conn.close()
|
231 |
|
|
|
232 |
# Run the dashboard only when this file is executed as a script.
if __name__ == '__main__':
    main()
|