Penguni committed on
Commit
99f87a3
·
verified ·
1 Parent(s): 8057ddd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -88
app.py CHANGED
@@ -1,51 +1,84 @@
1
  import sqlite3
2
  import pandas as pd
3
- import streamlit as st
4
  from wordcloud import WordCloud
5
  import matplotlib.pyplot as plt
6
- import plotly.express as px
 
 
7
 
8
  # Function to load data from SQLite database
 
9
  def load_data(db_file):
10
  conn = sqlite3.connect(db_file)
11
  return conn
12
 
13
  # Function to fetch data from database based on query
 
14
  def fetch_data(conn, query):
15
  return pd.read_sql_query(query, conn)
16
 
17
- # Function to calculate total movie releases by year
18
- def total_movie_releases_by_year(conn):
19
- query = '''
20
- SELECT startYear, COUNT(*) as total_movies
 
 
21
  FROM title_basics
22
- WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
23
- GROUP BY startYear
24
- ORDER BY startYear
25
  '''
26
- df = fetch_data(conn, query)
27
- return df
28
 
29
- # Function to calculate total count of years
30
- def total_count_of_years(conn):
31
- query = '''
32
  SELECT COUNT(DISTINCT startYear) as total_years
33
  FROM title_basics
34
  WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
35
  '''
36
- df = fetch_data(conn, query)
37
- return df
38
 
39
- # Function to calculate average rating of movies
40
- def average_rating_of_movies(conn):
41
- query = '''
42
  SELECT AVG(averageRating) as avg_rating
43
  FROM title_ratings
44
  '''
45
- df = fetch_data(conn, query)
46
- return df
47
 
48
- # Function to extract genres and create a word cloud
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  def create_genre_wordcloud(conn):
50
  query = '''
51
  SELECT genres
@@ -53,85 +86,77 @@ def create_genre_wordcloud(conn):
53
  WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
54
  '''
55
  df = fetch_data(conn, query)
 
 
 
 
 
 
 
 
 
 
56
 
57
- # Combine all genres into a single string
58
- all_genres = ','.join(df['genres']).replace(',', ' ')
59
-
60
- # Generate word cloud
61
- wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_genres)
62
-
63
- # Plot word cloud
64
- plt.figure(figsize=(10, 5))
65
- plt.imshow(wordcloud, interpolation='bilinear')
66
- plt.axis('off')
67
- plt.tight_layout()
68
-
69
- # Save word cloud to a temporary file
70
- temp_file = '/tmp/wordcloud.png'
71
- plt.savefig(temp_file)
72
-
73
- return temp_file
74
-
75
- # Function to plot globe map using Plotly
76
- def plot_globe_map(df):
77
- fig = px.scatter_geo(df, lat='latitude', lon='longitude', hover_name='primaryTitle',
78
- color='country', size='runtimeMinutes', size_max=20,
79
- projection='natural earth')
80
- fig.update_geos(showland=True, landcolor='rgb(217, 217, 217)',
81
- countrycolor='rgb(0, 0, 0)')
82
- fig.update_layout(title='Movie Locations on Globe',
83
- margin={"r":0,"t":0,"l":0,"b":0})
84
- return fig
85
 
86
- # Main function to run the Streamlit app
87
- def main():
88
- st.title('Movie Locations Dashboard')
89
 
90
- # Show globe map of movie locations
91
- st.subheader('Movie Locations on Globe')
92
- fig = plot_globe_map(df) # Assuming df is already loaded with CSV data
93
- st.plotly_chart(fig)
 
 
 
 
 
 
94
 
95
- st.title('IMDb Movie Dashboard')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Load data from SQLite database
98
  db_file = 'imdb_data.db' # Adjust path as needed
99
  conn = load_data(db_file)
100
 
101
- # Calculate total count of years
102
- df_total_years = total_count_of_years(conn)
103
- total_years = df_total_years.iloc[0]['total_years']
 
 
104
 
105
- # Calculate total movie releases by year
106
- df_total_movies = total_movie_releases_by_year(conn)
107
- total_movies = df_total_movies['total_movies'].sum()
 
108
 
109
- # Calculate average rating of movies
110
- df_avg_rating = average_rating_of_movies(conn)
111
- avg_rating = df_avg_rating.iloc[0]['avg_rating']
112
 
113
- # Generate word cloud of genres
114
- wordcloud_file = create_genre_wordcloud(conn)
 
 
115
 
116
  # Close database connection
117
  conn.close()
118
 
119
- # Display total count of years, total movie releases, and average rating in three columns
120
- col1, col2, col3 = st.columns(3)
121
-
122
- # Column 1: Total Count of Years
123
- with col1:
124
- st.subheader('Total Count of Years')
125
- st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_years}</p>", unsafe_allow_html=True)
126
-
127
- # Column 2: Total Movie Releases
128
- with col2:
129
- st.subheader('Total Movie Releases')
130
- st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{total_movies}</p>", unsafe_allow_html=True)
131
-
132
- # Column 3: Average Rating
133
- with col3:
134
- st.subheader('Average Rating')
135
- st.markdown(f"<p style='font-size:36px; font-weight:bold; text-align:center;'>{avg_rating:.2f}</p>", unsafe_allow_html=True)
136
-
137
- # Display word cloud of ge
 
1
  import sqlite3
2
  import pandas as pd
3
+ import plotly.express as px
4
  from wordcloud import WordCloud
5
  import matplotlib.pyplot as plt
6
+ from collections import Counter
7
+ import numpy as np
8
+ import streamlit as st
9
 
# Function to load data from SQLite database
def load_data(db_file):
    """Open and return a new SQLite connection to ``db_file``.

    The connection is intentionally not cached with ``st.cache``. ``main``
    closes the connection at the end of every run, so a cached connection
    would be returned already closed on the next rerun. In addition,
    ``sqlite3`` connections may by default only be used from the thread
    that created them, which makes sharing one across runs unsafe.
    Opening a SQLite connection is cheap, so a fresh one per run is fine.

    Args:
        db_file: Path of the SQLite database file.

    Returns:
        An open ``sqlite3.Connection``. The caller must close it.
    """
    return sqlite3.connect(db_file)
15
 
# Function to fetch data from database based on query
def fetch_data(conn, query):
    """Run ``query`` on ``conn`` and return the rows as a new DataFrame.

    This helper is not cached: the functions that call it are already
    cached, so a second cache layer here would only store each result
    twice and would have to hash the connection object as part of the
    key. Returning a fresh DataFrame on every call also means callers can
    add columns to the result without touching a shared cached object.

    Args:
        conn: An open ``sqlite3.Connection``.
        query: SQL text to execute.

    Returns:
        A ``pandas.DataFrame`` with one column per selected field.
    """
    return pd.read_sql_query(query, conn)
20
 
# Function to fetch summary info from database
@st.cache(
    allow_output_mutation=True,
    # The connection is only a handle to the same database file; leave it
    # out of the cache key instead of asking Streamlit to hash it.
    hash_funcs={sqlite3.Connection: lambda _: None},
)
def fetch_summary_info(conn):
    """Return headline figures for the movies in the database.

    Args:
        conn: An open ``sqlite3.Connection`` to the IMDb database.

    Returns:
        A tuple ``(total_movies, total_years, avg_rating)``:
        the number of ``title_basics`` rows with ``titleType = 'movie'``,
        the number of distinct non-missing ``startYear`` values among
        them, and the mean ``averageRating`` of those movies
        (``None`` when no movie has a rating).
    """
    # Fetch total count of movies
    query_total_movies = '''
        SELECT COUNT(*) as total_movies
        FROM title_basics
        WHERE titleType = 'movie'
    '''
    total_movies = fetch_data(conn, query_total_movies).iloc[0]['total_movies']

    # Fetch total count of years ('\N' is the dataset's missing-value marker)
    query_total_years = '''
        SELECT COUNT(DISTINCT startYear) as total_years
        FROM title_basics
        WHERE titleType = 'movie' AND startYear IS NOT NULL AND startYear != '\\N'
    '''
    total_years = fetch_data(conn, query_total_years).iloc[0]['total_years']

    # Fetch average rating of movies. title_ratings is joined to
    # title_basics so that only rows whose titleType is 'movie' are
    # averaged, matching the other two figures.
    query_avg_rating = '''
        SELECT AVG(tr.averageRating) as avg_rating
        FROM title_ratings tr
        JOIN title_basics tb ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie'
    '''
    avg_rating = fetch_data(conn, query_avg_rating).iloc[0]['avg_rating']

    return total_movies, total_years, avg_rating
48
+
# Function to plot global map of total films per region
@st.cache(allow_output_mutation=True)
def plot_global_map(df):
    """Build a choropleth of film counts per country.

    The input frame is not modified: country codes are mapped into a
    separate Series, so calling this twice with the same ``df`` gives the
    same figure.

    Args:
        df: DataFrame with a ``region`` column of country codes, one row
            per film/region pair.

    Returns:
        A Plotly figure coloured by ``log1p`` of the film count.
    """
    # Country code to name mapping (only a few examples shown for brevity)
    # TODO: complete this table; codes missing from it are dropped below
    # and therefore do not appear on the map.
    country_mapping = {
        'AF': 'Afghanistan', 'AX': 'Åland Islands', 'AL': 'Albania', 'DZ': 'Algeria', 'AS': 'American Samoa',
        # Add more mappings as per your original list
    }

    # Map country codes to country names without writing back into df
    country_names = df['region'].map(country_mapping).dropna()

    # Group by country and count the number of films
    df_grouped = (
        pd.DataFrame({'region': country_names})
        .groupby('region')
        .size()
        .reset_index(name='total_films')
    )

    # Apply log transformation to handle outliers
    df_grouped['log_total_films'] = np.log1p(df_grouped['total_films'])

    # Create a choropleth map with the log-transformed data
    fig = px.choropleth(df_grouped, locations='region', locationmode='country names',
                        color='log_total_films', hover_name='region',
                        color_continuous_scale='Plasma',  # Change the color scheme here
                        labels={'log_total_films': 'Total Films (log scale)'})

    # Update layout of the map
    fig.update_layout(title='Total Films by Country (Log Scale)',
                      geo=dict(showframe=False, showcoastlines=False,
                               projection_type='equirectangular'))

    return fig
79
+
80
+ # Function to create word cloud of genres
81
+ @st.cache(allow_output_mutation=True)
82
  def create_genre_wordcloud(conn):
83
  query = '''
84
  SELECT genres
 
86
  WHERE titleType = 'movie' AND genres IS NOT NULL AND genres != '\\N'
87
  '''
88
  df = fetch_data(conn, query)
89
+
90
+ # Process genres
91
+ genres = df['genres'].str.split(',', expand=True).stack().replace('\\N', pd.NA).dropna().reset_index(drop=True)
92
+ genre_counts = Counter(genres)
93
+
94
+ # Generate the word cloud
95
+ wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_counts)
96
+
97
+ # Display the word cloud using Streamlit
98
+ st.image(wordcloud.to_array(), use_column_width=True)
99
 
100
+ # Save word cloud to a temporary file (optional)
101
+ # temp_file = '/tmp/wordcloud.png'
102
+ # plt.savefig(temp_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # return temp_file
 
 
105
 
# Function to find best movie of each genre by numVotes * averageRating
@st.cache(
    allow_output_mutation=True,
    # Keep the connection handle out of the cache key.
    hash_funcs={sqlite3.Connection: lambda _: None},
)
def find_best_movies_by_genre(conn):
    """Return the top-scoring movie for each genre.

    A movie is assigned to the first genre listed in its ``genres`` field
    and scored as ``numVotes * averageRating``.

    Args:
        conn: An open ``sqlite3.Connection`` to the IMDb database.

    Returns:
        A DataFrame with columns ``genre``, ``primaryTitle``,
        ``startYear``, ``averageRating``, ``numVotes`` and ``score``,
        one row per genre, sorted by ``score`` descending. Empty when the
        query returns no rows.
    """
    query = '''
        SELECT tb.tconst, tb.primaryTitle, tb.startYear, tb.genres, tr.averageRating, tr.numVotes
        FROM title_basics tb
        JOIN title_ratings tr ON tb.tconst = tr.tconst
        WHERE tb.titleType = 'movie' AND tb.genres IS NOT NULL AND tb.genres != '\\N'
    '''
    movies = fetch_data(conn, query)

    # assign() returns a new frame, so the query result itself is left
    # untouched. Taking .str[0] of the split (instead of expand=True)[0])
    # also works when the query returned no rows.
    scored = movies.assign(
        genre=movies['genres'].str.split(',', n=1).str[0],
        score=movies['numVotes'] * movies['averageRating'],
    ).dropna(subset=['genre', 'score'])  # idxmax needs a real score per group

    # Get the best movie (highest score) for each genre
    idx = scored.groupby('genre')['score'].idxmax()
    columns = ['genre', 'primaryTitle', 'startYear', 'averageRating', 'numVotes', 'score']
    best_movies_by_genre = (
        scored.loc[idx, columns]
        .sort_values(by='score', ascending=False)
        .reset_index(drop=True)
    )

    return best_movies_by_genre
129
+
# Main function to orchestrate the dashboard
def main():
    """Render the IMDb dashboard.

    Shows the summary figures, the films-per-country map, the genre word
    cloud and the best movie per genre. The database connection is closed
    in a ``finally`` block so it is released even if one of the steps
    raises.
    """
    st.title('IMDb Dashboard')

    # Load data from SQLite database
    db_file = 'imdb_data.db'  # Adjust path as needed
    conn = load_data(db_file)

    try:
        # Fetch and display summary info
        total_movies, total_years, avg_rating = fetch_summary_info(conn)
        st.write(f"Total Movies: {total_movies}")
        st.write(f"Total Years: {total_years}")
        # AVG() yields NULL when there are no rows to average; formatting
        # that with ':.2f' would raise, so show a placeholder instead.
        if pd.isna(avg_rating):
            st.write("Average Rating: n/a")
        else:
            st.write(f"Average Rating: {avg_rating:.2f}")

        # Display global map of total films per region
        df_movie_region = pd.read_csv('movie_region.csv')  # Replace with your actual CSV loading
        fig = plot_global_map(df_movie_region)
        st.plotly_chart(fig)

        # Display word cloud of genres
        create_genre_wordcloud(conn)

        # Find and display the best movie of each genre
        best_movies_by_genre = find_best_movies_by_genre(conn)
        st.subheader("Best Movie of Each Genre:")
        st.write(best_movies_by_genre)
    finally:
        # Close database connection
        conn.close()


# Execute the main function
if __name__ == '__main__':
    main()