Penguni committed on
Commit
a57a561
·
verified ·
1 Parent(s): ce46156

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -106
app.py CHANGED
@@ -1,18 +1,21 @@
1
  import sqlite3
2
  import pandas as pd
3
  import plotly.express as px
4
- import plotly.graph_objects as go
5
  from wordcloud import WordCloud
6
  import matplotlib.pyplot as plt
7
  from collections import Counter
8
  import numpy as np
9
- import streamlit as st
10
 
# Function to open the SQLite database used by the dashboard
def load_data(db_file):
    """Open and return a connection to the SQLite database at ``db_file``.

    No rows are read here; the caller receives the live connection and is
    responsible for closing it.
    """
    return sqlite3.connect(db_file)
15
 
 
 
 
 
16
  # Function to fetch summary info from database
17
  def fetch_summary_info(conn):
18
  # Fetch total count of movies
@@ -40,31 +43,6 @@ def fetch_summary_info(conn):
40
 
41
  return total_movies, total_years, avg_rating
42
 
# Function to run a SQL query and return its rows
def fetch_data(conn, query):
    """Execute ``query`` on ``conn`` and return the result set as a DataFrame."""
    result_frame = pd.read_sql_query(sql=query, con=conn)
    return result_frame
46
-
# Function to count movie releases per (year, genre) pair
def fetch_genre_movie_releases(conn):
    """Return the number of movies released per year and genre.

    Reads ``startYear`` and ``genres`` for every row of ``title_basics``
    whose ``titleType`` is 'movie', skipping rows where either field holds
    the '\\N' placeholder. A title listing several comma-separated genres is
    counted once under each of them.

    Returns a DataFrame with the columns ``startYear`` (numeric), ``genres``
    and ``count``.
    """
    sql = '''
    SELECT startYear, genres
    FROM title_basics
    WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
    '''
    movies = pd.read_sql_query(sql, conn)

    # One row per (movie, genre): turn "A,B" into a list, then explode it.
    per_genre = movies.assign(genres=movies['genres'].str.split(',')).explode('genres')

    # Years come back as text; make them numeric so they sort and plot as numbers.
    per_genre['startYear'] = pd.to_numeric(per_genre['startYear'])

    tallies = per_genre.groupby(['startYear', 'genres']).size()
    return tallies.reset_index(name='count')
67
-
68
  # Function to plot global map of total films per region
69
  def plot_global_map(df):
70
  # Country code to name mapping
@@ -106,128 +84,125 @@ def plot_global_map(df):
106
  'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
107
  'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
108
  'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
109
- 'VE': 'Venezuela', 'VN': 'Viet Nam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
110
  }
111
-
112
- # Mapping country codes to names
113
  df['region'] = df['region'].map(country_mapping)
114
-
115
- # Count total films per country
116
- country_counts = df['region'].value_counts().reset_index(name='total_films')
117
-
118
- # Plotting with Plotly Express
119
- fig = px.choropleth(country_counts, locations='index', locationmode='country names', color='total_films',
120
- hover_name='index', color_continuous_scale='Viridis',
 
 
 
 
 
 
 
 
121
  title='Global Map of Total Films by Country')
122
 
123
  return fig
124
 
# Function to draw a word cloud sized by genre frequency
def plot_word_cloud(genres_list):
    """Draw a word cloud of the genres in ``genres_list`` on a new pyplot figure.

    Each genre is sized by how often it occurs in ``genres_list``. The
    function returns the ``matplotlib.pyplot`` module itself (not a Figure),
    with the freshly drawn figure as its current figure.
    """
    frequencies = Counter(genres_list)

    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate_from_frequencies(frequencies)

    # Render onto a new 10x5 figure with the axes hidden.
    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Top Genres Word Cloud')
    return plt
136
 
# Function to find the top-rated movie of each genre
def find_best_movies_by_genre(conn):
    """Return the highest-rated movie for every genre.

    Movies are read from ``title_basics`` joined to ``title_ratings``,
    ordered by ``averageRating`` descending. A movie listing several
    comma-separated genres competes in each of them; the first (i.e.
    highest-rated) movie seen for a genre wins.

    Returns a DataFrame with the columns ``Genre``, ``Movie`` and ``Rating``.
    The columns are present even when the query yields no rows.
    """
    query = '''
    SELECT tb.genres, tb.primaryTitle, tr.averageRating
    FROM title_basics tb
    JOIN title_ratings tr ON tb.tconst = tr.tconst
    WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    ORDER BY tr.averageRating DESC
    '''
    ranked = fetch_data(conn, query)

    # One row per (movie, genre), still in rating-descending order, so the
    # first occurrence of each genre is that genre's best-rated movie. This
    # replaces a Python-level iterrows() loop over the entire movie table.
    winners = (
        ranked.assign(Genre=ranked['genres'].str.split(','))
        .explode('Genre')
        .drop_duplicates(subset='Genre', keep='first')
    )

    best_movies_by_genre = (
        winners.rename(columns={'primaryTitle': 'Movie', 'averageRating': 'Rating'})
        [['Genre', 'Movie', 'Rating']]
        .reset_index(drop=True)
    )

    return best_movies_by_genre
163
 
# Streamlit app
def main():
    """Render the IMDb dashboard: summary numbers, table, map, word cloud, trend chart.

    Opens the SQLite database, draws every section with Streamlit, and
    always closes the connection, even if one of the sections raises.
    """
    st.title("IMDb Movie Data Analysis")

    # Load data from SQLite database
    db_file = '/content/imdb_data/imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    try:
        # Fetch summary info
        total_movies, total_years, avg_rating = fetch_summary_info(conn)

        # Layout for summary info
        col1, col2, col3 = st.columns(3)
        with col1:
            st.subheader("Total Movies")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_movies}</h1>", unsafe_allow_html=True)
        with col2:
            st.subheader("Total Years")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_years}</h1>", unsafe_allow_html=True)
        with col3:
            st.subheader("Average Rating")
            st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{avg_rating:.2f}</h1>", unsafe_allow_html=True)

        # Fetch best movies by genre
        best_movies_by_genre = find_best_movies_by_genre(conn)

        # Layout for table, global map, and word cloud
        st.markdown("---")
        row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
        with row1_col1:
            st.subheader("Best Movie of Each Genre")
            st.write(best_movies_by_genre)
        with row1_col2:
            st.subheader("Global Map of Total Films by Country")
            query_country_distribution = '''
            SELECT region
            FROM title_akas
            '''
            country_distribution = fetch_data(conn, query_country_distribution)
            fig = plot_global_map(country_distribution)
            st.plotly_chart(fig)
        with row1_col3:
            st.subheader("Word Cloud of Top Genres")
            query_genres = '''
            SELECT genres
            FROM title_basics
            WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
            '''
            genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
            # plot_word_cloud returns the pyplot module; keep it under its own
            # name (do not shadow the module-level `plt`) and hand Streamlit
            # the actual Figure rather than the module.
            wordcloud_plt = plot_word_cloud(genres_list)
            st.pyplot(wordcloud_plt.gcf())

        # Fetch genre movie releases by year
        genre_counts = fetch_genre_movie_releases(conn)

        # Plot line chart using Plotly Express
        st.markdown("---")
        st.subheader("Genre Movie Releases by Year")
        fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
                                   title='Genre Movie Releases by Year',
                                   labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
        fig_genre_counts.update_layout(xaxis_tickmode='linear')  # Ensure x-axis ticks are shown in a linear manner
        fig_genre_counts.update_xaxes(range=[2000, 2025])
        st.plotly_chart(fig_genre_counts)
    finally:
        # Close database connection even when a section above failed
        conn.close()


if __name__ == '__main__':
    main()
 
1
  import sqlite3
2
  import pandas as pd
3
  import plotly.express as px
4
+ import streamlit as st
5
  from wordcloud import WordCloud
6
  import matplotlib.pyplot as plt
7
  from collections import Counter
8
  import numpy as np
 
9
 
# Function to open the SQLite database used by the dashboard
def load_data(db_file):
    """Return a new ``sqlite3`` connection to the database file ``db_file``.

    Nothing is queried here; closing the connection is left to the caller.
    """
    return sqlite3.connect(db_file)
14
 
# Function to run a SQL query and return its rows
def fetch_data(conn, query):
    """Run ``query`` against the open connection ``conn`` and return a DataFrame of its rows."""
    rows = pd.read_sql_query(sql=query, con=conn)
    return rows
18
+
19
  # Function to fetch summary info from database
20
  def fetch_summary_info(conn):
21
  # Fetch total count of movies
 
43
 
44
  return total_movies, total_years, avg_rating
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # Function to plot global map of total films per region
47
  def plot_global_map(df):
48
  # Country code to name mapping
 
84
  'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
85
  'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
86
  'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
87
+ 'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
88
  }
89
+
90
+ # Map country codes to country names
91
  df['region'] = df['region'].map(country_mapping)
92
+
93
+ # Group by country and count the number of films
94
+ df_grouped = df.groupby('region').size().reset_index(name='total_films')
95
+
96
+ # Apply log transformation to handle outliers
97
+ df_grouped['log_total_films'] = np.log10(df_grouped['total_films'] + 1)
98
+
99
+ # Plotting the global map
100
+ fig = px.choropleth(df_grouped,
101
+ locations='region',
102
+ locationmode='country names',
103
+ color='log_total_films',
104
+ hover_name='region',
105
+ color_continuous_scale=px.colors.sequential.Plasma,
106
+ labels={'log_total_films': 'Log Total Films'},
107
  title='Global Map of Total Films by Country')
108
 
109
  return fig
110
 
# Function to create word cloud of genres
def create_genre_wordcloud(conn):
    """Render a word cloud of movie genres into the current Streamlit container.

    Genres are read from ``title_basics`` for rows whose ``titleType`` is
    'movie'; comma-separated genre lists are split so each genre is counted
    on its own. Nothing is returned: the figure is sent to Streamlit with
    ``st.pyplot`` and then closed.

    If the query yields no genres, a short message is written instead of a
    figure.
    """
    query = '''
    SELECT genres
    FROM title_basics
    WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # Process genres: one entry per (movie, genre), dropping missing values
    # and the '\N' placeholder.
    genres = df['genres'].str.split(',').explode()
    genres = genres[genres.notna() & (genres != '\\N')]
    genre_counts = Counter(genres)

    if not genre_counts:
        # An empty frequency table cannot be drawn; report it and stop.
        st.write("No genre data available for the word cloud.")
        return

    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)

    # Draw on an explicit figure so it can be released afterwards; pyplot
    # keeps every figure it creates alive until it is closed, and a
    # Streamlit script reruns on each interaction.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('Top Genres in IMDb Dataset')
    st.pyplot(fig)
    plt.close(fig)
133
 
# Function to find best movie of each genre by numVotes * averageRating
def find_best_movies_by_genre(conn):
    """Return the best movie per genre, ranked by ``numVotes * averageRating``.

    Only the first entry of a movie's comma-separated ``genres`` value is
    used as its genre. Rating and vote columns are coerced to numbers
    (unparseable values become NaN), and rows without a usable score are
    ignored.

    Returns a DataFrame with the columns ``genre``, ``primaryTitle``,
    ``startYear``, ``averageRating``, ``numVotes`` and ``score``, sorted by
    ``score`` descending. It is empty, with the same columns, when the query
    yields no rows.
    """
    query = '''
    SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
    FROM title_basics tb
    JOIN title_ratings tr ON tb.tconst = tr.tconst
    WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    df = fetch_data(conn, query)

    # Take the first genre of each movie. `.str[0]` on the split lists also
    # works for an empty result, where `expand=True` followed by `[0]` would
    # fail because no column 0 exists.
    df['genre'] = df['genres'].str.split(',', n=1).str[0]

    # Calculate score based on numVotes * averageRating; coerce first so
    # text-typed columns do not break the multiplication.
    df['averageRating'] = pd.to_numeric(df['averageRating'], errors='coerce')
    df['numVotes'] = pd.to_numeric(df['numVotes'], errors='coerce')
    df['score'] = df['numVotes'] * df['averageRating']

    # Rows without a genre or score cannot win and would upset idxmax.
    scored = df.dropna(subset=['genre', 'score'])

    # Get the best movie (highest score) for each genre
    idx = scored.groupby('genre')['score'].idxmax()
    columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']
    best_movies_by_genre = (
        scored.loc[idx, columns]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    return best_movies_by_genre
156
 
# Main function to orchestrate the dashboard
def main():
    """Render the IMDb dashboard: header, summary numbers, table, map and word cloud.

    Opens the SQLite database, draws every section with Streamlit, and
    always closes the connection, even when a section raises (for example
    when ``movie_region.csv`` is missing).
    """
    # Load data from SQLite database
    db_file = 'imdb_data/imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    try:
        # Fetch and display summary info
        total_movies, total_years, avg_rating = fetch_summary_info(conn)

        # Display summary information in three columns with bold outline
        st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
        st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)

        # Layout the summary information in three columns with big bold numbers
        col1, col2, col3 = st.columns(3)
        with col1:
            st.subheader("Total Movies")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_movies}</p>", unsafe_allow_html=True)
        with col2:
            st.subheader("Total Years")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_years}</p>", unsafe_allow_html=True)
        with col3:
            st.subheader("Average Rating")
            st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)

        # Create placeholders for the table and visualizations. The two
        # visualization columns get their own names instead of rebinding
        # the summary columns above.
        placeholder_table = st.empty()
        map_col, cloud_col = st.columns(2)

        # Best Movie Table
        with placeholder_table.container():
            st.subheader("Best Movie of Each Genre")
            best_movies_by_genre = find_best_movies_by_genre(conn)
            st.write(best_movies_by_genre)

        # Visualizations
        with map_col:
            st.subheader("Global Map of Total Films by Country")
            df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
            fig = plot_global_map(df_movie_region)
            st.plotly_chart(fig, use_container_width=True)

        with cloud_col:
            st.subheader("Word Cloud of Top Genres")
            create_genre_wordcloud(conn)
    finally:
        # Close database connection even when a section above failed
        conn.close()


# Execute the main function
if __name__ == '__main__':
    main()