Rooobert committed on
Commit
e1af865
·
verified ·
1 Parent(s): 083a08a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -76
app.py CHANGED
@@ -4,8 +4,6 @@ from bs4 import BeautifulSoup
4
  import pandas as pd
5
  from google.oauth2.service_account import Credentials
6
  import gspread
7
- import plotly.express as px
8
- import plotly.graph_objects as go
9
 
10
  # Google Sheets credentials
11
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
@@ -15,23 +13,22 @@ SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7
15
  # Streamlit app
16
  st.title("Booking.com 台南飯店資料爬取與分析")
17
  st.sidebar.header("功能選擇")
18
- mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
19
 
20
  @st.cache_data
21
- def scrape_booking_hotel():
22
  url = "https://www.booking.com/searchresults.zh-tw.html"
23
  headers = {
24
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
25
  'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
26
  }
27
  params = {
28
- 'ss': '台南',
29
- 'checkin': '2024-11-16',
30
- 'checkout': '2024-11-17',
31
  'group_adults': '2',
32
  'no_rooms': '1',
33
  'group_children': '0',
34
- 'dest_id': '-2637868',
35
  'dest_type': 'city'
36
  }
37
  try:
@@ -48,7 +45,7 @@ def scrape_booking_hotel():
48
  price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
49
  price = price_elem.text.strip() if price_elem else "無資料"
50
 
51
- # 清洗價格資料
52
  price = (
53
  price.replace('TWD', '')
54
  .replace(' ', '')
@@ -89,47 +86,6 @@ def scrape_booking_hotel():
89
  except requests.RequestException:
90
  return pd.DataFrame()
91
 
92
- def clean_rating(x):
93
- if pd.isna(x) or x == '無評分':
94
- return 0
95
- return float(str(x).replace('分數', '').replace('分', ''))
96
-
97
- def create_price_rating_scatter(df):
98
- if df.empty:
99
- st.warning("數據為空,無法生成圖表。")
100
- return None
101
- fig = px.scatter(
102
- df,
103
- x='價格',
104
- y='評分',
105
- text='飯店名稱',
106
- size='價格', # 點大小基於價格
107
- color='評分',
108
- title='台南飯店價格與評分關係圖',
109
- labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
110
- )
111
- fig.update_layout(height=600, title_x=0.5)
112
- return fig
113
-
114
- def create_price_distribution(df):
115
- if df.empty:
116
- st.warning("數據為空,無法生成圖表。")
117
- return None
118
- fig = go.Figure()
119
- fig.add_trace(go.Histogram(
120
- x=df['價格'],
121
- name='價格分布',
122
- nbinsx=10,
123
- marker_color='rgb(55, 83, 109)'
124
- ))
125
- fig.add_trace(go.Box(
126
- x=df['價格'],
127
- name='價格箱型圖',
128
- marker_color='rgb(26, 118, 255)'
129
- ))
130
- fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
131
- return fig
132
-
133
  def upload_to_google_sheets(df):
134
  creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
135
  gs = gspread.authorize(creds)
@@ -141,41 +97,25 @@ def upload_to_google_sheets(df):
141
 
142
  # Streamlit app implementation
143
  if mode == "資料爬取":
144
- st.header("爬取台南飯店資料")
 
 
 
 
145
  if st.button("開始爬取"):
146
- df = scrape_booking_hotel()
147
  if not df.empty:
148
  st.dataframe(df)
149
- df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
150
- st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
151
  else:
152
  st.error("未能成功爬取資料")
153
 
154
- elif mode == "資料視覺化":
155
- st.header("分析與視覺化")
156
- try:
157
- df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
158
- df['價格'] = pd.to_numeric(df['價格'], errors='coerce')
159
- df['評分'] = df['評分'].apply(clean_rating)
160
- df = df.dropna(subset=['價格'])
161
-
162
- st.write(f"有效數據行數:{len(df)}")
163
- st.write(f"價格缺失值數量:{df['價格'].isna().sum()}")
164
-
165
- scatter_fig = create_price_rating_scatter(df)
166
- if scatter_fig:
167
- st.plotly_chart(scatter_fig)
168
- dist_fig = create_price_distribution(df)
169
- if dist_fig:
170
- st.plotly_chart(dist_fig)
171
- except Exception as e:
172
- st.error(f"讀取或分析資料時發生錯誤:{e}")
173
-
174
  elif mode == "上傳至 Google Sheet":
175
  st.header("上傳資料至 Google Sheet")
176
  try:
177
- df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
178
  result = upload_to_google_sheets(df)
179
  st.success(result)
180
  except Exception as e:
181
- st.error(f"上傳資料時發生錯誤:{e}")
 
4
  import pandas as pd
5
  from google.oauth2.service_account import Credentials
6
  import gspread
 
 
7
 
8
  # Google Sheets credentials
9
  SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
 
13
  # Streamlit app
14
  st.title("Booking.com 台南飯店資料爬取與分析")
15
  st.sidebar.header("功能選擇")
16
+ mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "上傳至 Google Sheet"])
17
 
18
  @st.cache_data
19
+ def scrape_booking_hotel(location, checkin_date, checkout_date):
20
  url = "https://www.booking.com/searchresults.zh-tw.html"
21
  headers = {
22
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
23
  'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
24
  }
25
  params = {
26
+ 'ss': location,
27
+ 'checkin': checkin_date,
28
+ 'checkout': checkout_date,
29
  'group_adults': '2',
30
  'no_rooms': '1',
31
  'group_children': '0',
 
32
  'dest_type': 'city'
33
  }
34
  try:
 
45
  price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
46
  price = price_elem.text.strip() if price_elem else "無資料"
47
 
48
+ # Clean price data
49
  price = (
50
  price.replace('TWD', '')
51
  .replace(' ', '')
 
86
  except requests.RequestException:
87
  return pd.DataFrame()
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def upload_to_google_sheets(df):
90
  creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
91
  gs = gspread.authorize(creds)
 
97
 
98
  # Streamlit app implementation
99
  if mode == "資料爬取":
100
+ st.header("爬取飯店資料")
101
+ location = st.text_input("輸入查詢地點", "台南")
102
+ checkin_date = st.date_input("選擇入住日期")
103
+ checkout_date = st.date_input("選擇退房日期")
104
+
105
  if st.button("開始爬取"):
106
+ df = scrape_booking_hotel(location, checkin_date.strftime('%Y-%m-%d'), checkout_date.strftime('%Y-%m-%d'))
107
  if not df.empty:
108
  st.dataframe(df)
109
+ df.to_csv('booking_hotels.csv', index=False, encoding='utf-8-sig')
110
+ st.success("資料爬取成功,已儲存至 booking_hotels.csv")
111
  else:
112
  st.error("未能成功爬取資料")
113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  elif mode == "上傳至 Google Sheet":
115
  st.header("上傳資料至 Google Sheet")
116
  try:
117
+ df = pd.read_csv('booking_hotels.csv', encoding='utf-8-sig')
118
  result = upload_to_google_sheets(df)
119
  st.success(result)
120
  except Exception as e:
121
+ st.error(f"上傳資料時發生錯誤:{e}")