Rooobert committed on
Commit
974dc1d
·
verified ·
1 Parent(s): bb81aa7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -82
app.py CHANGED
@@ -8,17 +8,6 @@ from plotly.subplots import make_subplots
8
  from google.oauth2.service_account import Credentials
9
  import gspread
10
 
11
- # Google Sheets credentials
12
- SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
13
- SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
14
- SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
15
-
16
- # Streamlit app setup
17
- st.title("Booking.com 台南飯店資料爬取與分析")
18
- st.sidebar.header("功能選擇")
19
- mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
20
-
21
- @st.cache_data
22
  def scrape_booking_hotel():
23
  url = "https://www.booking.com/searchresults.zh-tw.html"
24
  headers = {
@@ -35,94 +24,126 @@ def scrape_booking_hotel():
35
  'dest_id': '-2637868',
36
  'dest_type': 'city'
37
  }
 
38
  try:
39
  response = requests.get(url, headers=headers, params=params)
40
  response.raise_for_status()
41
  soup = BeautifulSoup(response.text, 'html.parser')
 
42
  hotels_data = []
43
  hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
44
 
45
  for hotel in hotel_cards:
46
- name = hotel.find('div', {'data-testid': 'title'}).text.strip() if hotel.find('div', {'data-testid': 'title'}) else "無資料"
47
- price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
48
- price = price_elem.text.strip().replace('TWD', '').replace(',', '').strip() if price_elem else "無資料"
49
- rating_elem = hotel.find('div', {'class': 'a3b8729ab1'})
50
- rating = rating_elem.text.strip() if rating_elem else "無評分"
51
- description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
52
- if description_elem:
53
- room_type = description_elem.find('h4').text.strip() if description_elem.find('h4') else ""
54
- bed_info = description_elem.find('div').text.strip() if description_elem.find('div') else ""
55
- cancellation = "可免費取消" if description_elem.find('strong', text='可免費取消') else ""
56
- payment = "無需訂金" if description_elem.find('strong', text='無需訂金') else ""
57
- description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
58
- else:
59
- description = "無說明"
60
-
61
- hotels_data.append({'飯店名稱': name, '價格': price, '評分': rating, '說明': description})
62
- return pd.DataFrame(hotels_data).drop_duplicates()
63
- except requests.RequestException:
64
- st.error("無法從網站獲取資料")
65
- return pd.DataFrame()
66
 
67
- def clean_rating(x):
68
- if pd.isna(x) or x == '無評分':
69
- return 0
70
- return float(str(x).replace('分數', '').replace('分', ''))
71
 
72
- def upload_to_google_sheets(df):
73
- creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
74
- gs = gspread.authorize(creds)
75
- sheet = gs.open_by_url(SPREADSHEET_URL)
76
- worksheet = sheet.get_worksheet(0)
77
- worksheet.update([df.columns.values.tolist()] + df.astype(str).values.tolist())
78
- return "資料已成功上傳到 Google Sheet!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Visualization functions
81
  def create_price_rating_scatter(df):
82
- fig = px.scatter(df, x='價格', y='評分', text='飯店名稱', size='價格', color='評分',
83
- title='台南飯店價格與評分關係圖', labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'})
84
- fig.update_layout(height=600, title_x=0.5)
 
 
 
 
 
 
 
 
 
85
  return fig
86
 
87
  def create_price_distribution(df):
88
  fig = go.Figure()
89
  fig.add_trace(go.Histogram(x=df['價格'], name='價格分布', nbinsx=10, marker_color='rgb(55, 83, 109)'))
90
  fig.add_trace(go.Box(x=df['價格'], name='價格箱型圖', marker_color='rgb(26, 118, 255)'))
91
- fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
92
  return fig
93
 
94
- # Streamlit app implementation
95
- if mode == "資料爬取":
96
- st.header("爬取台南飯店資料")
97
- if st.button("開始爬取"):
98
- df = scrape_booking_hotel()
99
- if not df.empty:
100
- st.dataframe(df)
101
- df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
102
- st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
103
- else:
104
- st.error("未能成功爬取資料")
105
-
106
- elif mode == "資料視覺化":
107
- st.header("分析與視覺化")
108
- try:
109
- df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
110
- df['價格'] = pd.to_numeric(df['價格'], errors='coerce')
111
- df['評分'] = df['評分'].apply(clean_rating)
112
- df = df.dropna(subset=['價格'])
113
-
114
- scatter_fig = create_price_rating_scatter(df)
115
- st.plotly_chart(scatter_fig)
116
- dist_fig = create_price_distribution(df)
117
- st.plotly_chart(dist_fig)
118
- except Exception as e:
119
- st.error(f"讀取或分析資料時發生錯誤:{e}")
120
-
121
- elif mode == "上傳至 Google Sheet":
122
- st.header("上傳資料至 Google Sheet")
123
- try:
124
- df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
125
- result = upload_to_google_sheets(df)
126
- st.success(result)
127
- except Exception as e:
128
- st.error(f"上傳資料時發生錯誤:{e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from google.oauth2.service_account import Credentials
9
  import gspread
10
 
 
 
 
 
 
 
 
 
 
 
 
11
  def scrape_booking_hotel():
12
  url = "https://www.booking.com/searchresults.zh-tw.html"
13
  headers = {
 
24
  'dest_id': '-2637868',
25
  'dest_type': 'city'
26
  }
27
+
28
  try:
29
  response = requests.get(url, headers=headers, params=params)
30
  response.raise_for_status()
31
  soup = BeautifulSoup(response.text, 'html.parser')
32
+
33
  hotels_data = []
34
  hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
35
 
36
  for hotel in hotel_cards:
37
+ try:
38
+ name = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'}).text.strip() or "無資料"
39
+ price = hotel.find('span', {'data-testid': 'price-and-discounted-price', 'class': 'f6431b446c'}).text.strip() or "無資料"
40
+ price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
43
+ rating = rating_container.find('div', {'class': 'ac4a7896c7'}).text.strip() if rating_container else "無評分"
 
 
44
 
45
+ description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
46
+ if description_elem:
47
+ room_type = description_elem.find('h4', {'class': 'abf093bdfe'}).text.strip() if description_elem.find('h4', {'class': 'abf093bdfe'}) else ""
48
+ bed_info = description_elem.find('div', {'class': 'abf093bdfe'}).text.strip() if description_elem.find('div', {'class': 'abf093bdfe'}) else ""
49
+ cancellation = "可免費取消" if description_elem.find('strong', text='可免費取消') else ""
50
+ payment = "無需訂金" if description_elem.find('strong', text='無需訂金') else ""
51
+ description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
52
+ else:
53
+ description = "無說明"
54
+
55
+ hotels_data.append({
56
+ '飯店名稱': name,
57
+ '價格': price,
58
+ '評分': rating,
59
+ '說明': description
60
+ })
61
+
62
+ except AttributeError as e:
63
+ print(f"解析飯店資訊時發生錯誤: {e}")
64
+ continue
65
+
66
+ df = pd.DataFrame(hotels_data)
67
+ df = df.drop_duplicates()
68
+ return df
69
+
70
+ except requests.RequestException as e:
71
+ print(f"請求發生錯誤: {e}")
72
+ return pd.DataFrame()
73
 
 
74
  def create_price_rating_scatter(df):
75
+ fig = px.scatter(
76
+ df,
77
+ x='價格',
78
+ y='評分',
79
+ text='飯店名稱',
80
+ size='價格',
81
+ color='評分',
82
+ title='台南飯店價格與評分關係圖',
83
+ labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
84
+ )
85
+ fig.update_traces(textposition='top center', marker=dict(sizeref=2.*max(df['價格'])/(40.**2)))
86
+ fig.update_layout(height=600, showlegend=True, title_x=0.5, title_font_size=20)
87
  return fig
88
 
89
  def create_price_distribution(df):
90
  fig = go.Figure()
91
  fig.add_trace(go.Histogram(x=df['價格'], name='價格分布', nbinsx=10, marker_color='rgb(55, 83, 109)'))
92
  fig.add_trace(go.Box(x=df['價格'], name='價格箱型圖', marker_color='rgb(26, 118, 255)'))
93
+ fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, title_font_size=20, xaxis_title='價格 (TWD)', yaxis_title='數量', height=500, bargap=0.2, showlegend=True)
94
  return fig
95
 
96
+ def create_rating_box_by_price_range(df):
97
+ fig = px.box(df, x='價格區間', y='評分', title='不同價格區間的評分分布', labels={'價格區間': '價格類型', '評分': '評分 (0-10)'}, color='價格區間')
98
+ fig.update_layout(title_x=0.5, title_font_size=20, height=500, showlegend=False)
99
+ return fig
100
+
101
+ def create_hotel_comparison(df):
102
+ fig = make_subplots(specs=[[{"secondary_y": True}]])
103
+ df_sorted = df.sort_values('評分', ascending=True)
104
+ fig.add_trace(go.Bar(x=df_sorted['飯店名稱'], y=df_sorted['評分'], name="評分", marker_color='rgb(55, 83, 109)'))
105
+ fig.add_trace(go.Scatter(x=df_sorted['飯店名稱'], y=df_sorted['價格'], name="價格", marker_color='rgb(26, 118, 255)'), secondary_y=True)
106
+ fig.update_layout(title_text='台南飯店評分與價格比較', title_x=0.5, title_font_size=20, height=700, showlegend=True, xaxis_tickangle=45)
107
+ fig.update_yaxes(title_text="評分", secondary_y=False)
108
+ fig.update_yaxes(title_text="價格 (TWD)", secondary_y=True)
109
+ return fig
110
+
111
+ def update_google_sheet(df):
112
+ scope = ['https://www.googleapis.com/auth/spreadsheets']
113
+ creds = Credentials.from_service_account_file("realtime-441511-f5708eabdf26.json", scopes=scope)
114
+ gs = gspread.authorize(creds)
115
+ sheet = gs.open_by_url('https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0')
116
+ worksheet = sheet.get_worksheet(0)
117
+ worksheet.update([df.columns.values.tolist()] + df.astype(str).values.tolist())
118
+ st.success("Data updated to Google Sheet successfully!")
119
+
120
+ def main():
121
+ st.set_page_config(page_title="Booking.com Hotel Analysis")
122
+ st.title("Booking.com Hotel Analysis")
123
+
124
+ df = scrape_booking_hotel()
125
+
126
+ st.subheader("Hotel Data")
127
+ st.dataframe(df)
128
+
129
+ st.subheader("Price vs Rating Scatter Plot")
130
+ scatter_fig = create_price_rating_scatter(df)
131
+ st.plotly_chart(scatter_fig)
132
+
133
+ st.subheader("Price Distribution")
134
+ dist_fig = create_price_distribution(df)
135
+ st.plotly_chart(dist_fig)
136
+
137
+ st.subheader("Rating by Price Range")
138
+ box_fig = create_rating_box_by_price_range(df)
139
+ st.plotly_chart(box_fig)
140
+
141
+ st.subheader("Hotel Comparison")
142
+ comparison_fig = create_hotel_comparison(df)
143
+ st.plotly_chart(comparison_fig)
144
+
145
+ if st.button("Update Google Sheet"):
146
+ update_google_sheet(df)
147
+
148
+ if __name__ == "__main__":
149
+ main()