Rooobert committed on
Commit
cd5f134
·
verified ·
1 Parent(s): e1af865

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -16
app.py CHANGED
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
4
  import pandas as pd
5
  from google.oauth2.service_account import Credentials
6
  import gspread
 
 
7
 
8
  # Google Sheets credentials
9
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
@@ -13,22 +15,23 @@ SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7
13
  # Streamlit app
14
  st.title("Booking.com 台南飯店資料爬取與分析")
15
  st.sidebar.header("功能選擇")
16
- mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "上傳至 Google Sheet"])
17
 
18
  @st.cache_data
19
- def scrape_booking_hotel(location, checkin_date, checkout_date):
20
  url = "https://www.booking.com/searchresults.zh-tw.html"
21
  headers = {
22
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
23
  'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
24
  }
25
  params = {
26
- 'ss': location,
27
- 'checkin': checkin_date,
28
- 'checkout': checkout_date,
29
  'group_adults': '2',
30
  'no_rooms': '1',
31
  'group_children': '0',
 
32
  'dest_type': 'city'
33
  }
34
  try:
@@ -45,7 +48,7 @@ def scrape_booking_hotel(location, checkin_date, checkout_date):
45
  price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
46
  price = price_elem.text.strip() if price_elem else "無資料"
47
 
48
- # Clean price data
49
  price = (
50
  price.replace('TWD', '')
51
  .replace(' ', '')
@@ -86,6 +89,47 @@ def scrape_booking_hotel(location, checkin_date, checkout_date):
86
  except requests.RequestException:
87
  return pd.DataFrame()
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def upload_to_google_sheets(df):
90
  creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
91
  gs = gspread.authorize(creds)
@@ -97,25 +141,41 @@ def upload_to_google_sheets(df):
97
 
98
  # Streamlit app implementation
99
  if mode == "資料爬取":
100
- st.header("爬取飯店資料")
101
- location = st.text_input("輸入查詢地點", "台南")
102
- checkin_date = st.date_input("選擇入住日期")
103
- checkout_date = st.date_input("選擇退房日期")
104
-
105
  if st.button("開始爬取"):
106
- df = scrape_booking_hotel(location, checkin_date.strftime('%Y-%m-%d'), checkout_date.strftime('%Y-%m-%d'))
107
  if not df.empty:
108
  st.dataframe(df)
109
- df.to_csv('booking_hotels.csv', index=False, encoding='utf-8-sig')
110
- st.success("資料爬取成功,已儲存至 booking_hotels.csv")
111
  else:
112
  st.error("未能成功爬取資料")
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  elif mode == "上傳至 Google Sheet":
115
  st.header("上傳資料至 Google Sheet")
116
  try:
117
- df = pd.read_csv('booking_hotels.csv', encoding='utf-8-sig')
118
  result = upload_to_google_sheets(df)
119
  st.success(result)
120
  except Exception as e:
121
- st.error(f"上傳資料時發生錯誤:{e}")
 
4
  import pandas as pd
5
  from google.oauth2.service_account import Credentials
6
  import gspread
7
+ import plotly.express as px
8
+ import plotly.graph_objects as go
9
 
10
  # Google Sheets credentials
11
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
 
15
  # Streamlit app
16
  st.title("Booking.com 台南飯店資料爬取與分析")
17
  st.sidebar.header("功能選擇")
18
+ mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
19
 
20
  @st.cache_data
21
+ def scrape_booking_hotel():
22
  url = "https://www.booking.com/searchresults.zh-tw.html"
23
  headers = {
24
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
25
  'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
26
  }
27
  params = {
28
+ 'ss': '台南',
29
+ 'checkin': '2024-11-16',
30
+ 'checkout': '2024-11-17',
31
  'group_adults': '2',
32
  'no_rooms': '1',
33
  'group_children': '0',
34
+ 'dest_id': '-2637868',
35
  'dest_type': 'city'
36
  }
37
  try:
 
48
  price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
49
  price = price_elem.text.strip() if price_elem else "無資料"
50
 
51
+ # 清洗價格資料
52
  price = (
53
  price.replace('TWD', '')
54
  .replace(' ', '')
 
89
  except requests.RequestException:
90
  return pd.DataFrame()
91
 
92
def clean_rating(x):
    """Convert a rating cell into a numeric score.

    Args:
        x: A rating value; may be missing (NaN/None), the placeholder
            '無評分', a number, or text containing a number together with
            the '分數' / '分' markers.

    Returns:
        0 when the value is missing, equals '無評分', or contains no number;
        otherwise the score as a float.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    import re

    if pd.isna(x) or x == '無評分':
        return 0

    text = str(x).replace('分數', '').replace('分', '')
    try:
        # Fast path: the remaining text is already a plain number.
        return float(text)
    except ValueError:
        # The text carries something besides the score (a label, a repeated
        # score, a placeholder). Use the first numeric token instead of
        # letting one bad cell abort the whole analysis.
        match = re.search(r'\d+(?:\.\d+)?', text)
        return float(match.group()) if match else 0
96
+
97
def create_price_rating_scatter(df):
    """Build a scatter plot of price (x) against rating (y).

    Args:
        df: DataFrame read with the columns '價格', '評分' and '飯店名稱'.

    Returns:
        The Plotly figure, or None (after showing a Streamlit warning) when
        ``df`` is empty.
    """
    if df.empty:
        st.warning("數據為空,無法生成圖表。")
        return None

    scatter_options = {
        'x': '價格',
        'y': '評分',
        'text': '飯店名稱',
        'size': '價格',  # marker size scales with the price
        'color': '評分',
        'title': '台南飯店價格與評分關係圖',
        'labels': {'價格': '房價 (TWD)', '評分': '評分 (0-10)'},
    }
    chart = px.scatter(df, **scatter_options)
    chart.update_layout(height=600, title_x=0.5)
    return chart
113
+
114
def create_price_distribution(df):
    """Build a figure with a histogram and a box plot of the '價格' column.

    Args:
        df: DataFrame read with a '價格' column.

    Returns:
        The Plotly figure, or None (after showing a Streamlit warning) when
        ``df`` is empty.
    """
    if df.empty:
        st.warning("數據為空,無法生成圖表。")
        return None

    prices = df['價格']
    histogram = go.Histogram(
        x=prices,
        name='價格分布',
        nbinsx=10,
        marker_color='rgb(55, 83, 109)',
    )
    box_plot = go.Box(
        x=prices,
        name='價格箱型圖',
        marker_color='rgb(26, 118, 255)',
    )

    chart = go.Figure()
    # Same trace order as before: histogram first, then the box plot.
    for trace in (histogram, box_plot):
        chart.add_trace(trace)
    chart.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
    return chart
132
+
133
  def upload_to_google_sheets(df):
134
  creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
135
  gs = gspread.authorize(creds)
 
141
 
142
  # Streamlit app implementation
143
# Dispatch on the sidebar selection. Each branch is one page of the app.
if mode == "資料爬取":
    st.header("爬取台南飯店資料")
    if st.button("開始爬取"):
        df = scrape_booking_hotel()
        if not df.empty:
            st.dataframe(df)
            # Persist the result so the other two modes can read it back.
            df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
            st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
        else:
            st.error("未能成功爬取資料")

elif mode == "資料視覺化":
    st.header("分析與視覺化")
    try:
        df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        df['價格'] = pd.to_numeric(df['價格'], errors='coerce')
        df['評分'] = df['評分'].apply(clean_rating)

        # Count unparseable prices BEFORE dropping them; counting after
        # dropna() would always report 0.
        missing_price_count = df['價格'].isna().sum()
        df = df.dropna(subset=['價格'])

        st.write(f"有效數據行數:{len(df)}")
        st.write(f"價格缺失值數量:{missing_price_count}")

        # The chart builders return None (and warn) when there is no data.
        scatter_fig = create_price_rating_scatter(df)
        if scatter_fig is not None:
            st.plotly_chart(scatter_fig)
        dist_fig = create_price_distribution(df)
        if dist_fig is not None:
            st.plotly_chart(dist_fig)
    except Exception as e:
        # UI boundary: show any read/parse/plot failure to the user.
        st.error(f"讀取或分析資料時發生錯誤:{e}")

elif mode == "上傳至 Google Sheet":
    st.header("上傳資料至 Google Sheet")
    try:
        df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        result = upload_to_google_sheets(df)
        st.success(result)
    except Exception as e:
        # UI boundary: show any read/upload failure to the user.
        st.error(f"上傳資料時發生錯誤:{e}")