Penguni committed on
Commit
ce46156
·
verified ·
1 Parent(s): c562e03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -80
app.py CHANGED
@@ -1,21 +1,18 @@
1
  import sqlite3
2
  import pandas as pd
3
- import streamlit as st
4
  import plotly.express as px
 
5
  from wordcloud import WordCloud
6
  import matplotlib.pyplot as plt
7
  from collections import Counter
8
  import numpy as np
 
9
 
10
  # Function to load data from SQLite database
11
  def load_data(db_file):
12
  conn = sqlite3.connect(db_file)
13
  return conn
14
 
15
- # Function to fetch data from database based on query
16
- def fetch_data(conn, query):
17
- return pd.read_sql_query(query, conn)
18
-
19
  # Function to fetch summary info from database
20
  def fetch_summary_info(conn):
21
  # Fetch total count of movies
@@ -43,6 +40,31 @@ def fetch_summary_info(conn):
43
 
44
  return total_movies, total_years, avg_rating
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  # Function to plot global map of total films per region
47
  def plot_global_map(df):
48
  # Country code to name mapping
@@ -84,125 +106,128 @@ def plot_global_map(df):
84
  'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
85
  'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
86
  'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
87
- 'VE': 'Venezuela, Bolivarian Republic of', 'VN': 'Viet Nam', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
88
  }
89
-
90
- # Map country codes to country names
91
  df['region'] = df['region'].map(country_mapping)
92
-
93
- # Group by country and count the number of films
94
- df_grouped = df.groupby('region').size().reset_index(name='total_films')
95
-
96
- # Apply log transformation to handle outliers
97
- df_grouped['log_total_films'] = np.log10(df_grouped['total_films'] + 1)
98
-
99
- # Plotting the global map
100
- fig = px.choropleth(df_grouped,
101
- locations='region',
102
- locationmode='country names',
103
- color='log_total_films',
104
- hover_name='region',
105
- color_continuous_scale=px.colors.sequential.Plasma,
106
- labels={'log_total_films': 'Log Total Films'},
107
  title='Global Map of Total Films by Country')
108
 
109
  return fig
110
 
111
- # Function to create word cloud of genres
112
- def create_genre_wordcloud(conn):
113
- query = '''
114
- SELECT genres
115
- FROM title_basics
116
- WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
117
- '''
118
- df = fetch_data(conn, query)
119
-
120
- # Process genres
121
- genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
122
- genre_counts = Counter(genres)
123
-
124
- # Generate the word cloud
125
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
126
 
127
- # Display the word cloud
128
  plt.figure(figsize=(10, 5))
129
  plt.imshow(wordcloud, interpolation='bilinear')
130
  plt.axis('off')
131
- plt.title('Top Genres in IMDb Dataset')
132
- st.pyplot(plt.gcf()) # Pass the current figure explicitly to st.pyplot()
133
 
134
- # Function to find best movie of each genre by numVotes * averageRating
135
  def find_best_movies_by_genre(conn):
136
  query = '''
137
- SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
138
  FROM title_basics tb
139
  JOIN title_ratings tr ON tb.tconst = tr.tconst
140
  WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
 
141
  '''
142
  df = fetch_data(conn, query)
143
-
144
- # Split genres and select the first genre for each movie
145
- df['genre'] = df['genres'].str.split(',', expand=True)[0]
146
-
147
- # Calculate score based on numVotes * averageRating
148
- df['score'] = df['numVotes'] * df['averageRating']
149
-
150
- # Get the best movie (highest score) for each genre
151
- idx = df.groupby('genre')['score'].idxmax()
152
- best_movies_by_genre = df.loc[idx, ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']] \
153
- .sort_values(by='score', ascending=False).reset_index(drop=True)
 
 
 
154
 
155
  return best_movies_by_genre
156
 
157
- # Main function to orchestrate the dashboard
158
  def main():
 
 
159
  # Load data from SQLite database
160
- db_file = 'imdb_data.db' # Adjust path as needed
161
  conn = load_data(db_file)
162
 
163
- # Fetch and display summary info
164
  total_movies, total_years, avg_rating = fetch_summary_info(conn)
165
-
166
- # Display summary information in three columns with bold outline
167
- st.markdown("<h1 style='text-align: center; font-size: 24px; border: 2px solid black; padding: 10px;'>IMDb Dashboard</h1>", unsafe_allow_html=True)
168
- st.markdown("<h2 style='text-align: center; font-size: 20px;'>Summary Information</h2>", unsafe_allow_html=True)
169
 
170
- # Layout the summary information in three columns with big bold numbers
171
  col1, col2, col3 = st.columns(3)
172
  with col1:
173
  st.subheader("Total Movies")
174
- st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_movies}</p>", unsafe_allow_html=True)
175
  with col2:
176
  st.subheader("Total Years")
177
- st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{total_years}</p>", unsafe_allow_html=True)
178
  with col3:
179
  st.subheader("Average Rating")
180
- st.markdown(f"<p style='text-align: center; font-size: 18px; font-weight: bold;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)
181
 
182
- # Create placeholders for the table and visualizations
183
- placeholder_table = st.empty()
184
- col1, col2 = st.columns(2)
185
 
186
- # Best Movie Table
187
- with placeholder_table.container():
 
 
188
  st.subheader("Best Movie of Each Genre")
189
- best_movies_by_genre = find_best_movies_by_genre(conn)
190
  st.write(best_movies_by_genre)
191
-
192
- # Visualizations
193
- with col1:
194
  st.subheader("Global Map of Total Films by Country")
195
- df_movie_region = pd.read_csv('movie_region.csv') # Replace with your actual CSV loading
196
- fig = plot_global_map(df_movie_region)
197
- st.plotly_chart(fig, use_container_width=True)
198
-
199
- with col2:
 
 
 
200
  st.subheader("Word Cloud of Top Genres")
201
- create_genre_wordcloud(conn)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
  # Close database connection
204
  conn.close()
205
 
206
- # Execute the main function
207
  if __name__ == '__main__':
208
  main()
 
1
  import sqlite3
2
  import pandas as pd
 
3
  import plotly.express as px
4
+ import plotly.graph_objects as go
5
  from wordcloud import WordCloud
6
  import matplotlib.pyplot as plt
7
  from collections import Counter
8
  import numpy as np
9
+ import streamlit as st
10
 
11
  # Function to load data from SQLite database
12
  def load_data(db_file):
13
  conn = sqlite3.connect(db_file)
14
  return conn
15
 
 
 
 
 
16
  # Function to fetch summary info from database
17
  def fetch_summary_info(conn):
18
  # Fetch total count of movies
 
40
 
41
  return total_movies, total_years, avg_rating
42
 
43
+ # Function to fetch data from database based on query
44
+ def fetch_data(conn, query):
45
+ return pd.read_sql_query(query, conn)
46
+
47
+ # Function to fetch genre movie releases by year
48
+ def fetch_genre_movie_releases(conn):
49
+ query = '''
50
+ SELECT startYear, genres
51
+ FROM title_basics
52
+ WHERE titleType = 'movie' AND startYear != '\\N' AND genres != '\\N'
53
+ '''
54
+ df = pd.read_sql_query(query, conn)
55
+
56
+ # Split genres and explode to separate rows
57
+ df['genres'] = df['genres'].str.split(',')
58
+ df = df.explode('genres')
59
+
60
+ # Convert startYear to numeric
61
+ df['startYear'] = pd.to_numeric(df['startYear'])
62
+
63
+ # Group by startYear and genre, count the number of movies
64
+ genre_counts = df.groupby(['startYear', 'genres']).size().reset_index(name='count')
65
+
66
+ return genre_counts
67
+
68
  # Function to plot global map of total films per region
69
  def plot_global_map(df):
70
  # Country code to name mapping
 
106
  'TL': 'Timor-Leste', 'TG': 'Togo', 'TO': 'Tonga', 'TT': 'Trinidad and Tobago', 'TN': 'Tunisia',
107
  'TR': 'Turkey', 'TM': 'Turkmenistan', 'UG': 'Uganda', 'UA': 'Ukraine', 'AE': 'United Arab Emirates',
108
  'GB': 'United Kingdom', 'US': 'United States', 'UY': 'Uruguay', 'UZ': 'Uzbekistan', 'VU': 'Vanuatu',
109
+ 'VE': 'Venezuela', 'VN': 'Viet Nam', 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe'
110
  }
111
+
112
+ # Mapping country codes to names
113
  df['region'] = df['region'].map(country_mapping)
114
+
115
+ # Count total films per country
116
+ country_counts = df['region'].value_counts().reset_index(name='total_films')
117
+
118
+ # Plotting with Plotly Express
119
+ fig = px.choropleth(country_counts, locations='index', locationmode='country names', color='total_films',
120
+ hover_name='index', color_continuous_scale='Viridis',
 
 
 
 
 
 
 
 
121
  title='Global Map of Total Films by Country')
122
 
123
  return fig
124
 
125
+ # Function to plot word cloud of top genres
126
+ def plot_word_cloud(genres_list):
127
+ genre_counts = Counter(genres_list)
 
 
 
 
 
 
 
 
 
 
 
128
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
129
 
130
+ # Plot the word cloud
131
  plt.figure(figsize=(10, 5))
132
  plt.imshow(wordcloud, interpolation='bilinear')
133
  plt.axis('off')
134
+ plt.title('Top Genres Word Cloud')
135
+ return plt
136
 
137
+ # Function to find best movies by genre
138
  def find_best_movies_by_genre(conn):
139
  query = '''
140
+ SELECT tb.genres, tb.primaryTitle, tr.averageRating
141
  FROM title_basics tb
142
  JOIN title_ratings tr ON tb.tconst = tr.tconst
143
  WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
144
+ ORDER BY tr.averageRating DESC
145
  '''
146
  df = fetch_data(conn, query)
147
+
148
+ # Split genres and keep the top-rated movie for each genre
149
+ genre_movie_mapping = {}
150
+ for _, row in df.iterrows():
151
+ genres = row['genres'].split(',')
152
+ for genre in genres:
153
+ if genre not in genre_movie_mapping:
154
+ genre_movie_mapping[genre] = (row['primaryTitle'], row['averageRating'])
155
+
156
+ # Create a DataFrame for display
157
+ best_movies_by_genre = pd.DataFrame([
158
+ {'Genre': genre, 'Movie': movie[0], 'Rating': movie[1]}
159
+ for genre, movie in genre_movie_mapping.items()
160
+ ])
161
 
162
  return best_movies_by_genre
163
 
164
+ # Streamlit app
165
  def main():
166
+ st.title("IMDb Movie Data Analysis")
167
+
168
  # Load data from SQLite database
169
+ db_file = '/content/imdb_data/imdb_data.db' # Adjust path as needed
170
  conn = load_data(db_file)
171
 
172
+ # Fetch summary info
173
  total_movies, total_years, avg_rating = fetch_summary_info(conn)
 
 
 
 
174
 
175
+ # Layout for summary info
176
  col1, col2, col3 = st.columns(3)
177
  with col1:
178
  st.subheader("Total Movies")
179
+ st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_movies}</h1>", unsafe_allow_html=True)
180
  with col2:
181
  st.subheader("Total Years")
182
+ st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{total_years}</h1>", unsafe_allow_html=True)
183
  with col3:
184
  st.subheader("Average Rating")
185
+ st.markdown(f"<h1 style='text-align: center; font-size: 48px;'>{avg_rating:.2f}</h1>", unsafe_allow_html=True)
186
 
187
+ # Fetch best movies by genre
188
+ best_movies_by_genre = find_best_movies_by_genre(conn)
 
189
 
190
+ # Layout for table, global map, and word cloud
191
+ st.markdown("---")
192
+ row1_col1, row1_col2, row1_col3 = st.columns([1, 1, 1])
193
+ with row1_col1:
194
  st.subheader("Best Movie of Each Genre")
 
195
  st.write(best_movies_by_genre)
196
+ with row1_col2:
 
 
197
  st.subheader("Global Map of Total Films by Country")
198
+ query_country_distribution = '''
199
+ SELECT region
200
+ FROM title_akas
201
+ '''
202
+ country_distribution = fetch_data(conn, query_country_distribution)
203
+ fig = plot_global_map(country_distribution)
204
+ st.plotly_chart(fig)
205
+ with row1_col3:
206
  st.subheader("Word Cloud of Top Genres")
207
+ query_genres = '''
208
+ SELECT genres
209
+ FROM title_basics
210
+ WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
211
+ '''
212
+ genres_list = fetch_data(conn, query_genres)['genres'].str.split(',').explode().tolist()
213
+ plt = plot_word_cloud(genres_list)
214
+ st.pyplot(plt)
215
+
216
+ # Fetch genre movie releases by year
217
+ genre_counts = fetch_genre_movie_releases(conn)
218
+
219
+ # Plot line chart using Plotly Express
220
+ st.markdown("---")
221
+ st.subheader("Genre Movie Releases by Year")
222
+ fig_genre_counts = px.line(genre_counts, x='startYear', y='count', color='genres',
223
+ title='Genre Movie Releases by Year',
224
+ labels={'startYear': 'Year', 'count': 'Number of Movies', 'genres': 'Genre'})
225
+ fig_genre_counts.update_layout(xaxis_tickmode='linear') # Ensure x-axis ticks are shown in a linear manner
226
+ fig_genre_counts.update_xaxes(range=[2000, 2025])
227
+ st.plotly_chart(fig_genre_counts)
228
 
229
  # Close database connection
230
  conn.close()
231
 
 
232
  if __name__ == '__main__':
233
  main()