Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,8 @@ from bs4 import BeautifulSoup
|
|
4 |
import pandas as pd
|
5 |
from google.oauth2.service_account import Credentials
|
6 |
import gspread
|
|
|
|
|
7 |
|
8 |
# Google Sheets credentials
|
9 |
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
|
@@ -13,22 +15,23 @@ SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7
|
|
13 |
# Streamlit app
|
14 |
st.title("Booking.com 台南飯店資料爬取與分析")
|
15 |
st.sidebar.header("功能選擇")
|
16 |
-
mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "上傳至 Google Sheet"])
|
17 |
|
18 |
@st.cache_data
|
19 |
-
def scrape_booking_hotel(
|
20 |
url = "https://www.booking.com/searchresults.zh-tw.html"
|
21 |
headers = {
|
22 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
23 |
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
24 |
}
|
25 |
params = {
|
26 |
-
'ss':
|
27 |
-
'checkin':
|
28 |
-
'checkout':
|
29 |
'group_adults': '2',
|
30 |
'no_rooms': '1',
|
31 |
'group_children': '0',
|
|
|
32 |
'dest_type': 'city'
|
33 |
}
|
34 |
try:
|
@@ -45,7 +48,7 @@ def scrape_booking_hotel(location, checkin_date, checkout_date):
|
|
45 |
price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
|
46 |
price = price_elem.text.strip() if price_elem else "無資料"
|
47 |
|
48 |
-
#
|
49 |
price = (
|
50 |
price.replace('TWD', '')
|
51 |
.replace(' ', '')
|
@@ -86,6 +89,47 @@ def scrape_booking_hotel(location, checkin_date, checkout_date):
|
|
86 |
except requests.RequestException:
|
87 |
return pd.DataFrame()
|
88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
def upload_to_google_sheets(df):
|
90 |
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
|
91 |
gs = gspread.authorize(creds)
|
@@ -97,25 +141,41 @@ def upload_to_google_sheets(df):
|
|
97 |
|
98 |
# Streamlit app implementation
|
99 |
if mode == "資料爬取":
|
100 |
-
st.header("
|
101 |
-
location = st.text_input("輸入查詢地點", "台南")
|
102 |
-
checkin_date = st.date_input("選擇入住日期")
|
103 |
-
checkout_date = st.date_input("選擇退房日期")
|
104 |
-
|
105 |
if st.button("開始爬取"):
|
106 |
-
df = scrape_booking_hotel(
|
107 |
if not df.empty:
|
108 |
st.dataframe(df)
|
109 |
-
df.to_csv('
|
110 |
-
st.success("資料爬取成功,已儲存至
|
111 |
else:
|
112 |
st.error("未能成功爬取資料")
|
113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
elif mode == "上傳至 Google Sheet":
|
115 |
st.header("上傳資料至 Google Sheet")
|
116 |
try:
|
117 |
-
df = pd.read_csv('
|
118 |
result = upload_to_google_sheets(df)
|
119 |
st.success(result)
|
120 |
except Exception as e:
|
121 |
-
st.error(f"上傳資料時發生錯誤:{e}")
|
|
|
4 |
import pandas as pd
|
5 |
from google.oauth2.service_account import Credentials
|
6 |
import gspread
|
7 |
+
import plotly.express as px
|
8 |
+
import plotly.graph_objects as go
|
9 |
|
10 |
# Google Sheets credentials
|
11 |
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
|
|
|
15 |
# Streamlit app: page title plus the sidebar selector whose value is stored in `mode`.
st.title("Booking.com 台南飯店資料爬取與分析")
st.sidebar.header("功能選擇")
mode = st.sidebar.selectbox(
    "選擇模式",
    ["資料爬取", "資料視覺化", "上傳至 Google Sheet"],
)
|
19 |
|
20 |
@st.cache_data
|
21 |
+
def scrape_booking_hotel():
|
22 |
url = "https://www.booking.com/searchresults.zh-tw.html"
|
23 |
headers = {
|
24 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
25 |
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
|
26 |
}
|
27 |
params = {
|
28 |
+
'ss': '台南',
|
29 |
+
'checkin': '2024-11-16',
|
30 |
+
'checkout': '2024-11-17',
|
31 |
'group_adults': '2',
|
32 |
'no_rooms': '1',
|
33 |
'group_children': '0',
|
34 |
+
'dest_id': '-2637868',
|
35 |
'dest_type': 'city'
|
36 |
}
|
37 |
try:
|
|
|
48 |
price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
|
49 |
price = price_elem.text.strip() if price_elem else "無資料"
|
50 |
|
51 |
+
# 清洗價格資料
|
52 |
price = (
|
53 |
price.replace('TWD', '')
|
54 |
.replace(' ', '')
|
|
|
89 |
except requests.RequestException:
|
90 |
return pd.DataFrame()
|
91 |
|
92 |
+
def clean_rating(x):
    """Convert a raw rating cell into a numeric score.

    Args:
        x: Raw rating value, e.g. a number, a string such as ``'分數8.5'`` or
            ``'8.5分'``, the placeholder ``'無評分'``, or a missing value.

    Returns:
        The rating as a float, or 0 when the value is missing, equals the
        ``'無評分'`` placeholder, or contains no parseable number.
    """
    if pd.isna(x):
        return 0

    text = str(x).strip()
    if text == '無評分':
        return 0

    # Drop the textual decorations around the number before parsing.
    text = text.replace('分數', '').replace('分', '').strip()
    try:
        return float(text)
    except ValueError:
        # One unexpected cell must not abort cleaning of the whole column;
        # treat it the same way as a missing rating.
        return 0
|
96 |
+
|
97 |
+
def create_price_rating_scatter(df):
    """Build a scatter plot of the '價格' column against the '評分' column.

    Args:
        df: DataFrame providing the '價格', '評分' and '飯店名稱' columns.

    Returns:
        The figure produced by ``px.scatter``, or None (after showing a
        Streamlit warning) when ``df`` is empty.
    """
    if df.empty:
        st.warning("數據為空,無法生成圖表。")
        return None

    axis_labels = {'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
    scatter_options = dict(
        x='價格',
        y='評分',
        text='飯店名稱',
        size='價格',  # marker size is driven by the price column
        color='評分',
        title='台南飯店價格與評分關係圖',
        labels=axis_labels,
    )

    figure = px.scatter(df, **scatter_options)
    figure.update_layout(height=600, title_x=0.5)
    return figure
|
113 |
+
|
114 |
+
def create_price_distribution(df):
    """Build a figure holding a histogram and a box trace of the '價格' column.

    Args:
        df: DataFrame providing the '價格' column.

    Returns:
        A ``go.Figure`` with both traces added, or None (after showing a
        Streamlit warning) when ``df`` is empty.
    """
    if df.empty:
        st.warning("數據為空,無法生成圖表。")
        return None

    prices = df['價格']
    histogram_trace = go.Histogram(
        x=prices,
        name='價格分布',
        nbinsx=10,
        marker_color='rgb(55, 83, 109)',
    )
    box_trace = go.Box(
        x=prices,
        name='價格箱型圖',
        marker_color='rgb(26, 118, 255)',
    )

    figure = go.Figure()
    # Histogram first, then the box trace, so trace order is unchanged.
    for trace in (histogram_trace, box_trace):
        figure.add_trace(trace)
    figure.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
    return figure
|
132 |
+
|
133 |
def upload_to_google_sheets(df):
|
134 |
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
|
135 |
gs = gspread.authorize(creds)
|
|
|
141 |
|
142 |
# Streamlit app implementation: render the page for the mode chosen in the sidebar.
if mode == "資料爬取":
    st.header("爬取台南飯店資料")
    if st.button("開始爬取"):
        df = scrape_booking_hotel()
        if not df.empty:
            st.dataframe(df)
            # Written with the same file name and encoding the other two
            # modes use when they read the data back.
            df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
            st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
        else:
            st.error("未能成功爬取資料")

elif mode == "資料視覺化":
    st.header("分析與視覺化")
    try:
        df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        # Non-numeric prices become NaN so they can be counted and dropped.
        df['價格'] = pd.to_numeric(df['價格'], errors='coerce')
        df['評分'] = df['評分'].apply(clean_rating)

        # Count the missing prices BEFORE dropping those rows; counting
        # after dropna() would always report 0.
        missing_price_count = int(df['價格'].isna().sum())
        df = df.dropna(subset=['價格'])

        st.write(f"有效數據行數:{len(df)}")
        st.write(f"價格缺失值數量:{missing_price_count}")

        # The chart builders return None when there is nothing to plot, so
        # test for None explicitly instead of relying on figure truthiness.
        scatter_fig = create_price_rating_scatter(df)
        if scatter_fig is not None:
            st.plotly_chart(scatter_fig)
        dist_fig = create_price_distribution(df)
        if dist_fig is not None:
            st.plotly_chart(dist_fig)
    except Exception as e:
        # Top-level UI boundary: show the failure to the user instead of
        # letting the page crash.
        st.error(f"讀取或分析資料時發生錯誤:{e}")

elif mode == "上傳至 Google Sheet":
    st.header("上傳資料至 Google Sheet")
    try:
        df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        result = upload_to_google_sheets(df)
        st.success(result)
    except Exception as e:
        # Top-level UI boundary: show the failure to the user instead of
        # letting the page crash.
        st.error(f"上傳資料時發生錯誤:{e}")
|