Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,18 +1,21 @@
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
-
import
|
5 |
from wordcloud import WordCloud
|
6 |
import matplotlib.pyplot as plt
|
7 |
from collections import Counter
|
8 |
import numpy as np
|
9 |
-
import streamlit as st
|
10 |
|
11 |
# Function to load data from SQLite database
|
12 |
def load_data(db_file):
    """Open a SQLite connection to ``db_file`` and return it.

    Args:
        db_file: Database target passed straight to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``. The caller must close it.
    """
    return sqlite3.connect(db_file)
|
15 |
|
|
|
|
|
|
|
|
|
16 |
# Function to fetch summary info from database
|
17 |
def fetch_summary_info(conn):
|
18 |
# Fetch total count of movies
|
@@ -40,31 +43,6 @@ def fetch_summary_info(conn):
|
|
40 |
|
41 |
return total_movies, total_years, avg_rating
|
42 |
|
43 |
-
# Function to fetch data from database based on query
|
44 |
-
def fetch_data(conn, query):
    """Run ``query`` against ``conn`` and return the rows as a DataFrame.

    Args:
        conn: Open database connection accepted by ``pandas.read_sql_query``.
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn)
|
46 |
-
|
47 |
-
# Function to fetch genre movie releases by year
|
48 |
-
def fetch_genre_movie_releases(conn):
    """Count movie releases per (year, genre) pair.

    Reads ``startYear`` and ``genres`` from ``title_basics`` for rows whose
    ``titleType`` is ``'movie'``, skipping the ``\\N`` placeholder in either
    column. A title listing several comma-separated genres is counted once
    for each genre.

    Args:
        conn: Open database connection accepted by ``pandas.read_sql_query``.

    Returns:
        A DataFrame with columns ``startYear``, ``genres`` and ``count``.
    """
    query = '''
        SELECT startYear, genres
        FROM title_basics
        WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
    '''
    df = pd.read_sql_query(query, conn)

    # Coerce instead of raising: one malformed year should not take the whole
    # chart down. Rows whose year cannot be parsed are discarded.
    df['startYear'] = pd.to_numeric(df['startYear'], errors='coerce')
    df = df.dropna(subset=['startYear', 'genres'])
    df = df.assign(
        startYear=df['startYear'].astype(int),
        genres=df['genres'].str.split(','),
    )

    # One row per (movie, genre) so multi-genre titles count for each genre.
    df = df.explode('genres')

    return df.groupby(['startYear', 'genres']).size().reset_index(name='count')
|
67 |
-
|
68 |
# Function to plot global map of total films per region
|
69 |
def plot_global_map(df):
|
70 |
# Country code to name mapping
|
@@ -106,128 +84,125 @@ def plot_global_map(df):
|
|
106 |
'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
|
107 |
'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
|
108 |
'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
|
109 |
-
'VE': 'Venezuela', 'VN': 'Viet Nam', '
|
110 |
}
|
111 |
-
|
112 |
-
#
|
113 |
df['region'] = df['region'].map(country_mapping)
|
114 |
-
|
115 |
-
#
|
116 |
-
|
117 |
-
|
118 |
-
#
|
119 |
-
|
120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
121 |
title='Global Map of Total Films by Country')
|
122 |
|
123 |
return fig
|
124 |
|
125 |
-
# Function to
|
126 |
-
def
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
|
129 |
|
130 |
-
#
|
131 |
plt.figure(figsize=(10, 5))
|
132 |
plt.imshow(wordcloud, interpolation='bilinear')
|
133 |
plt.axis('off')
|
134 |
-
plt.title('Top Genres
|
135 |
-
|
136 |
|
137 |
-
# Function to find best
|
138 |
def find_best_movies_by_genre(conn):
|
139 |
query = '''
|
140 |
-
SELECT tb.
|
141 |
FROM title_basics tb
|
142 |
JOIN title_ratings tr ON tb.tconst = tr.tconst
|
143 |
WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
|
144 |
-
ORDER BY tr.averageRating DESC
|
145 |
'''
|
146 |
df = fetch_data(conn, query)
|
147 |
-
|
148 |
-
# Split genres and
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
{'Genre': genre, 'Movie': movie[0], 'Rating': movie[1]}
|
159 |
-
for genre, movie in genre_movie_mapping.items()
|
160 |
-
])
|
161 |
|
162 |
return best_movies_by_genre
|
163 |
|
164 |
-
#
|
165 |
def main():
|
166 |
-
st.title("IMDb Movie Data Analysis")
|
167 |
-
|
168 |
# Load data from SQLite database
|
169 |
-
db_file = '
|
170 |
conn = load_data(db_file)
|
171 |
|
172 |
-
# Fetch summary info
|
173 |
total_movies, total_years, avg_rating = fetch_summary_info(conn)
|
|
|
|
|
|
|
|
|
174 |
|
175 |
-
# Layout
|
176 |
col1, col2, col3 = st.columns(3)
|
177 |
with col1:
|
178 |
st.subheader("Total Movies")
|
179 |
-
st.markdown(f"<
|
180 |
with col2:
|
181 |
st.subheader("Total Years")
|
182 |
-
st.markdown(f"<
|
183 |
with col3:
|
184 |
st.subheader("Average Rating")
|
185 |
-
st.markdown(f"<
|
186 |
|
187 |
-
#
|
188 |
-
|
|
|
189 |
|
190 |
-
#
|
191 |
-
|
192 |
-
row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
|
193 |
-
with row1_col1:
|
194 |
st.subheader("Best Movie of Each Genre")
|
|
|
195 |
st.write(best_movies_by_genre)
|
196 |
-
|
|
|
|
|
197 |
st.subheader("Global Map of Total Films by Country")
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
fig = plot_global_map(country_distribution)
|
204 |
-
st.plotly_chart(fig)
|
205 |
-
with row1_col3:
|
206 |
st.subheader("Word Cloud of Top Genres")
|
207 |
-
|
208 |
-
SELECT genres
|
209 |
-
FROM title_basics
|
210 |
-
WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
|
211 |
-
'''
|
212 |
-
genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
|
213 |
-
plt = plot_word_cloud(genres_list)
|
214 |
-
st.pyplot(plt)
|
215 |
-
|
216 |
-
# Fetch genre movie releases by year
|
217 |
-
genre_counts = fetch_genre_movie_releases(conn)
|
218 |
-
|
219 |
-
# Plot line chart using Plotly Express
|
220 |
-
st.markdown("---")
|
221 |
-
st.subheader("Genre Movie Releases by Year")
|
222 |
-
fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
|
223 |
-
title='Genre Movie Releases by Year',
|
224 |
-
labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
|
225 |
-
fig_genre_counts.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
|
226 |
-
fig_genre_counts.update_xaxes(range=[2000, 2025])
|
227 |
-
st.plotly_chart(fig_genre_counts)
|
228 |
|
229 |
# Close database connection
|
230 |
conn.close()
|
231 |
|
|
|
232 |
# Run the dashboard only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
|
|
1 |
import sqlite3
|
2 |
import pandas as pd
|
3 |
import plotly.express as px
|
4 |
+
import streamlit as st
|
5 |
from wordcloud import WordCloud
|
6 |
import matplotlib.pyplot as plt
|
7 |
from collections import Counter
|
8 |
import numpy as np
|
|
|
9 |
|
10 |
# Function to load data from SQLite database
|
11 |
def load_data(db_file):
    """Open a SQLite connection to ``db_file`` and return it.

    Args:
        db_file: Database target passed straight to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``. The caller must close it.
    """
    return sqlite3.connect(db_file)
|
14 |
|
15 |
+
# Function to fetch data from database based on query
|
16 |
+
def fetch_data(conn, query):
    """Run ``query`` against ``conn`` and return the rows as a DataFrame.

    Args:
        conn: Open database connection accepted by ``pandas.read_sql_query``.
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn)
|
18 |
+
|
19 |
# Function to fetch summary info from database
|
20 |
def fetch_summary_info(conn):
|
21 |
# Fetch total count of movies
|
|
|
43 |
|
44 |
return total_movies, total_years, avg_rating
|
45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
# Function to plot global map of total films per region
|
47 |
def plot_global_map(df):
|
48 |
# Country code to name mapping
|
|
|
84 |
'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
|
85 |
'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
|
86 |
'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
|
87 |
+
'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
|
88 |
}
|
89 |
+
|
90 |
+
# Map country codes to country names
|
91 |
df['region'] = df['region'].map(country_mapping)
|
92 |
+
|
93 |
+
# Group by country and count the number of films
|
94 |
+
df_grouped = df.groupby('region').size().reset_index(name='total_films')
|
95 |
+
|
96 |
+
# Apply log transformation to handle outliers
|
97 |
+
df_grouped['log_total_films'] = np.log10(df_grouped['total_films'] + 1)
|
98 |
+
|
99 |
+
# Plotting the global map
|
100 |
+
fig = px.choropleth(df_grouped,
|
101 |
+
locations='region',
|
102 |
+
locationmode='country names',
|
103 |
+
color='log_total_films',
|
104 |
+
hover_name='region',
|
105 |
+
color_continuous_scale=px.colors.sequential.Plasma,
|
106 |
+
labels={'log_total_films': 'Log Total Films'},
|
107 |
title='Global Map of Total Films by Country')
|
108 |
|
109 |
return fig
|
110 |
|
111 |
+
# Function to create word cloud of genres
|
112 |
+
def create_genre_wordcloud(conn):
    """Render a word cloud of movie genres into the Streamlit page.

    Reads the ``genres`` column of ``title_basics`` for rows whose
    ``titleType`` is ``'movie'``, splits each comma-separated value, counts
    how often every genre occurs, and draws the counts as a word cloud.

    Args:
        conn: Open database connection, forwarded to ``fetch_data``.

    Returns:
        None. The figure is emitted with ``st.pyplot``; when there are no
        genres to plot an informational message is shown instead.
    """
    query = '''
        SELECT genres
        FROM title_basics
        WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # One entry per genre token; drop missing values and the '\N' placeholder.
    genres = df['genres'].str.split(',').explode().dropna()
    genre_counts = Counter(genres[genres != '\\N'])

    # generate_from_frequencies raises on an empty mapping, so bail out early.
    if not genre_counts:
        st.info("No genre data available to build the word cloud.")
        return

    wordcloud = WordCloud(
        width=800, height=400, background_color='white'
    ).generate_from_frequencies(genre_counts)

    # Use an explicit figure rather than pyplot's implicit "current figure",
    # and close it afterwards: Streamlit reruns the script on every
    # interaction, so unclosed figures would pile up in memory.
    fig, ax = plt.subplots(figsize=(10, 5))
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        ax.set_title('Top Genres in IMDb Dataset')
        st.pyplot(fig)
    finally:
        plt.close(fig)
|
133 |
|
134 |
+
# Function to find best movie of each genre by numVotes * averageRating
|
135 |
def find_best_movies_by_genre(conn):
    """Pick the top-scoring movie for each genre.

    Joins ``title_basics`` with ``title_ratings`` on ``tconst``, keeps rows
    whose ``titleType`` is ``'movie'`` and whose ``genres`` is set, and files
    each movie under the first genre in its comma-separated list. The score
    is ``numVotes * averageRating``.

    Args:
        conn: Open database connection, forwarded to ``fetch_data``.

    Returns:
        A DataFrame with columns ``genre``, ``primaryTitle``, ``startYear``,
        ``averageRating``, ``numVotes`` and ``score``: one row per genre,
        sorted by ``score`` descending. Empty when no movie can be scored.
    """
    query = '''
        SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # ``.str[0]`` instead of ``expand=True)[0]``: the latter raises KeyError
    # when the query returns no rows, because no column 0 is created.
    df['genre'] = df['genres'].str.split(',', n=1).str[0]

    # Coerce so text-typed or missing values yield NaN rather than a
    # TypeError; unscorable rows are dropped before ranking.
    votes = pd.to_numeric(df['numVotes'], errors='coerce')
    rating = pd.to_numeric(df['averageRating'], errors='coerce')
    df['score'] = votes * rating
    df = df.dropna(subset=['genre', 'score'])

    columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']
    if df.empty:
        return df[columns].reset_index(drop=True)

    # Row label of the highest score within each genre.
    idx = df.groupby('genre')['score'].idxmax()
    best_movies_by_genre = (
        df.loc[idx, columns]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    return best_movies_by_genre
|
156 |
|
157 |
+
# Main function to orchestrate the dashboard
|
158 |
def main():
    """Build the Streamlit dashboard page.

    Opens the SQLite database, shows the three summary figures, the
    best-movie-per-genre table, the global film map (read from
    ``movie_region.csv``) and the genre word cloud, then closes the
    connection.
    """
    # Load data from SQLite database
    db_file = 'imdb_data/imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    # try/finally so the connection is released even when a query or a
    # rendering step raises.
    try:
        # Fetch and display summary info
        total_movies, total_years, avg_rating = fetch_summary_info(conn)

        st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
        st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)

        # Summary figures, one per column
        movies_col, years_col, rating_col = st.columns(3)
        with movies_col:
            st.subheader("Total Movies")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_movies}</p>", unsafe_allow_html=True)
        with years_col:
            st.subheader("Total Years")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_years}</p>", unsafe_allow_html=True)
        with rating_col:
            st.subheader("Average Rating")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)

        # Reserve the table slot first, then the two chart columns, so the
        # table stays above the charts on the page.
        placeholder_table = st.empty()
        map_col, cloud_col = st.columns(2)

        # Best Movie Table
        with placeholder_table.container():
            st.subheader("Best Movie of Each Genre")
            best_movies_by_genre = find_best_movies_by_genre(conn)
            st.write(best_movies_by_genre)

        # Visualizations
        with map_col:
            st.subheader("Global Map of Total Films by Country")
            df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
            fig = plot_global_map(df_movie_region)
            st.plotly_chart(fig, use_container_width=True)

        with cloud_col:
            st.subheader("Word Cloud of Top Genres")
            create_genre_wordcloud(conn)
    finally:
        # Close database connection
        conn.close()
|
205 |
|
206 |
+
# Execute the main function
|
207 |
# Run the dashboard only when this file is executed as a script.
if __name__ == '__main__':
    main()
|