Rooobert committed on
Commit
bb81aa7
·
verified ·
1 Parent(s): cd5f134

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -90
app.py CHANGED
@@ -2,17 +2,18 @@ import streamlit as st
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import pandas as pd
5
- from google.oauth2.service_account import Credentials
6
- import gspread
7
  import plotly.express as px
8
  import plotly.graph_objects as go
 
 
 
9
 
10
  # Google Sheets credentials
11
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
12
  SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
13
  SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
14
 
15
- # Streamlit app
16
  st.title("Booking.com 台南飯店資料爬取與分析")
17
  st.sidebar.header("功能選擇")
18
  mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
@@ -42,51 +43,25 @@ def scrape_booking_hotel():
42
  hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
43
 
44
  for hotel in hotel_cards:
45
- try:
46
- name_elem = hotel.find('div', {'data-testid': 'title'})
47
- name = name_elem.text.strip() if name_elem else "無資料"
48
- price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
49
- price = price_elem.text.strip() if price_elem else "無資料"
50
-
51
- # 清洗價格資料
52
- price = (
53
- price.replace('TWD', '')
54
- .replace(' ', '')
55
- .replace(',', '')
56
- .strip()
57
- )
58
- price = int(price) if price.isdigit() else None
59
-
60
- rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
61
- rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None
62
- rating = rating_elem.text.strip() if rating_elem else "無評分"
63
- description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
64
-
65
- if description_elem:
66
- room_type = description_elem.find('h4')
67
- room_type = room_type.text.strip() if room_type else ""
68
- bed_info = description_elem.find('div')
69
- bed_info = bed_info.text.strip() if bed_info else ""
70
- cancellation = description_elem.find('strong', text='可免費取消')
71
- cancellation = "可免費取消" if cancellation else ""
72
- payment = description_elem.find('strong', text='無需訂金')
73
- payment = "無需訂金" if payment else ""
74
- description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
75
- else:
76
- description = "無說明"
77
-
78
- hotels_data.append({
79
- '飯店名稱': name,
80
- '價格': price,
81
- '評分': rating,
82
- '說明': description
83
- })
84
- except AttributeError:
85
- continue
86
 
87
- df = pd.DataFrame(hotels_data).drop_duplicates()
88
- return df
89
  except requests.RequestException:
 
90
  return pd.DataFrame()
91
 
92
  def clean_rating(x):
@@ -94,51 +69,28 @@ def clean_rating(x):
94
  return 0
95
  return float(str(x).replace('分數', '').replace('分', ''))
96
 
 
 
 
 
 
 
 
 
 
97
  def create_price_rating_scatter(df):
98
- if df.empty:
99
- st.warning("數據為空,無法生成圖表。")
100
- return None
101
- fig = px.scatter(
102
- df,
103
- x='價格',
104
- y='評分',
105
- text='飯店名稱',
106
- size='價格', # 點大小基於價格
107
- color='評分',
108
- title='台南飯店價格與評分關係圖',
109
- labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
110
- )
111
  fig.update_layout(height=600, title_x=0.5)
112
  return fig
113
 
114
  def create_price_distribution(df):
115
- if df.empty:
116
- st.warning("數據為空,無法生成圖表。")
117
- return None
118
  fig = go.Figure()
119
- fig.add_trace(go.Histogram(
120
- x=df['價格'],
121
- name='價格分布',
122
- nbinsx=10,
123
- marker_color='rgb(55, 83, 109)'
124
- ))
125
- fig.add_trace(go.Box(
126
- x=df['價格'],
127
- name='價格箱型圖',
128
- marker_color='rgb(26, 118, 255)'
129
- ))
130
  fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
131
  return fig
132
 
133
- def upload_to_google_sheets(df):
134
- creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
135
- gs = gspread.authorize(creds)
136
- sheet = gs.open_by_url(SPREADSHEET_URL)
137
- worksheet = sheet.get_worksheet(0)
138
- df1 = df.astype(str)
139
- worksheet.update([df1.columns.values.tolist()] + df1.values.tolist())
140
- return "資料已成功上傳到 Google Sheet!"
141
-
142
  # Streamlit app implementation
143
  if mode == "資料爬取":
144
  st.header("爬取台南飯店資料")
@@ -159,15 +111,10 @@ elif mode == "資料視覺化":
159
  df['評分'] = df['評分'].apply(clean_rating)
160
  df = df.dropna(subset=['價格'])
161
 
162
- st.write(f"有效數據行數:{len(df)}")
163
- st.write(f"價格缺失值數量:{df['價格'].isna().sum()}")
164
-
165
  scatter_fig = create_price_rating_scatter(df)
166
- if scatter_fig:
167
- st.plotly_chart(scatter_fig)
168
  dist_fig = create_price_distribution(df)
169
- if dist_fig:
170
- st.plotly_chart(dist_fig)
171
  except Exception as e:
172
  st.error(f"讀取或分析資料時發生錯誤:{e}")
173
 
@@ -178,4 +125,4 @@ elif mode == "上傳至 Google Sheet":
178
  result = upload_to_google_sheets(df)
179
  st.success(result)
180
  except Exception as e:
181
- st.error(f"上傳資料時發生錯誤:{e}")
 
2
  import requests
3
  from bs4 import BeautifulSoup
4
  import pandas as pd
 
 
5
  import plotly.express as px
6
  import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
+ from google.oauth2.service_account import Credentials
9
+ import gspread
10
 
11
  # Google Sheets credentials
12
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
13
  SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
14
  SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
15
 
16
+ # Streamlit app setup
17
  st.title("Booking.com 台南飯店資料爬取與分析")
18
  st.sidebar.header("功能選擇")
19
  mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
 
43
  hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
44
 
45
  for hotel in hotel_cards:
46
+ name = hotel.find('div', {'data-testid': 'title'}).text.strip() if hotel.find('div', {'data-testid': 'title'}) else "無資料"
47
+ price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
48
+ price = price_elem.text.strip().replace('TWD', '').replace(',', '').strip() if price_elem else "無資料"
49
+ rating_elem = hotel.find('div', {'class': 'a3b8729ab1'})
50
+ rating = rating_elem.text.strip() if rating_elem else "無評分"
51
+ description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
52
+ if description_elem:
53
+ room_type = description_elem.find('h4').text.strip() if description_elem.find('h4') else ""
54
+ bed_info = description_elem.find('div').text.strip() if description_elem.find('div') else ""
55
+ cancellation = "可免費取消" if description_elem.find('strong', text='可免費取消') else ""
56
+ payment = "無需訂金" if description_elem.find('strong', text='無需訂金') else ""
57
+ description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
58
+ else:
59
+ description = "無說明"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
+ hotels_data.append({'飯店名稱': name, '價格': price, '評分': rating, '說明': description})
62
+ return pd.DataFrame(hotels_data).drop_duplicates()
63
  except requests.RequestException:
64
+ st.error("無法從網站獲取資料")
65
  return pd.DataFrame()
66
 
67
  def clean_rating(x):
 
69
  return 0
70
  return float(str(x).replace('分數', '').replace('分', ''))
71
 
72
def upload_to_google_sheets(df):
    """Replace the first worksheet of the configured spreadsheet with ``df``.

    Authenticates with the service-account key at ``SERVICE_ACCOUNT_FILE``
    (scopes from ``SCOPE``), opens ``SPREADSHEET_URL`` and writes a header
    row followed by one row per DataFrame record, every cell as text.

    Args:
        df: DataFrame to upload; its column labels become the header row.

    Returns:
        The fixed success message shown to the user.

    Raises:
        Nothing is caught here; errors from credential loading or the Sheets
        API propagate to the caller.
    """
    creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
    client = gspread.authorize(creds)
    worksheet = client.open_by_url(SPREADSHEET_URL).get_worksheet(0)

    # Blank out missing values before stringifying; a plain astype(str) would
    # upload the literal text "nan"/"None" into the sheet.
    cells = df.astype(object).where(df.notna(), "").astype(str)
    rows = [df.columns.values.tolist()] + cells.values.tolist()

    # Clear first so rows left over from a previous, longer upload do not
    # survive below the new data. NOTE(review): if the update call fails the
    # sheet stays empty until the next successful upload.
    worksheet.clear()
    worksheet.update(rows)
    return "資料已成功上傳到 Google Sheet!"
79
+
80
+ # Visualization functions
81
  def create_price_rating_scatter(df):
82
+ fig = px.scatter(df, x='價格', y='評分', text='飯店名稱', size='價格', color='評分',
83
+ title='台南飯店價格與評分關係圖', labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'})
 
 
 
 
 
 
 
 
 
 
 
84
  fig.update_layout(height=600, title_x=0.5)
85
  return fig
86
 
87
  def create_price_distribution(df):
 
 
 
88
  fig = go.Figure()
89
+ fig.add_trace(go.Histogram(x=df['價格'], name='價格分布', nbinsx=10, marker_color='rgb(55, 83, 109)'))
90
+ fig.add_trace(go.Box(x=df['價格'], name='價格箱型圖', marker_color='rgb(26, 118, 255)'))
 
 
 
 
 
 
 
 
 
91
  fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
92
  return fig
93
 
 
 
 
 
 
 
 
 
 
94
  # Streamlit app implementation
95
  if mode == "資料爬取":
96
  st.header("爬取台南飯店資料")
 
111
  df['評分'] = df['評分'].apply(clean_rating)
112
  df = df.dropna(subset=['價格'])
113
 
 
 
 
114
  scatter_fig = create_price_rating_scatter(df)
115
+ st.plotly_chart(scatter_fig)
 
116
  dist_fig = create_price_distribution(df)
117
+ st.plotly_chart(dist_fig)
 
118
  except Exception as e:
119
  st.error(f"讀取或分析資料時發生錯誤:{e}")
120
 
 
125
  result = upload_to_google_sheets(df)
126
  st.success(result)
127
  except Exception as e:
128
+ st.error(f"上傳資料時發生錯誤:{e}")