Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,17 +2,18 @@ import streamlit as st
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
import pandas as pd
|
5 |
-
from google.oauth2.service_account import Credentials
|
6 |
-
import gspread
|
7 |
import plotly.express as px
|
8 |
import plotly.graph_objects as go
|
|
|
|
|
|
|
9 |
|
10 |
# Google Sheets credentials
|
11 |
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
|
12 |
SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
|
13 |
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
|
14 |
|
15 |
-
# Streamlit app
|
16 |
st.title("Booking.com 台南飯店資料爬取與分析")
|
17 |
st.sidebar.header("功能選擇")
|
18 |
mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
|
@@ -42,51 +43,25 @@ def scrape_booking_hotel():
|
|
42 |
hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
|
43 |
|
44 |
for hotel in hotel_cards:
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
|
61 |
-
rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None
|
62 |
-
rating = rating_elem.text.strip() if rating_elem else "無評分"
|
63 |
-
description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
|
64 |
-
|
65 |
-
if description_elem:
|
66 |
-
room_type = description_elem.find('h4')
|
67 |
-
room_type = room_type.text.strip() if room_type else ""
|
68 |
-
bed_info = description_elem.find('div')
|
69 |
-
bed_info = bed_info.text.strip() if bed_info else ""
|
70 |
-
cancellation = description_elem.find('strong', text='可免費取消')
|
71 |
-
cancellation = "可免費取消" if cancellation else ""
|
72 |
-
payment = description_elem.find('strong', text='無需訂金')
|
73 |
-
payment = "無需訂金" if payment else ""
|
74 |
-
description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
|
75 |
-
else:
|
76 |
-
description = "無說明"
|
77 |
-
|
78 |
-
hotels_data.append({
|
79 |
-
'飯店名稱': name,
|
80 |
-
'價格': price,
|
81 |
-
'評分': rating,
|
82 |
-
'說明': description
|
83 |
-
})
|
84 |
-
except AttributeError:
|
85 |
-
continue
|
86 |
|
87 |
-
|
88 |
-
return
|
89 |
except requests.RequestException:
|
|
|
90 |
return pd.DataFrame()
|
91 |
|
92 |
def clean_rating(x):
|
@@ -94,51 +69,28 @@ def clean_rating(x):
|
|
94 |
return 0
|
95 |
return float(str(x).replace('分數', '').replace('分', ''))
|
96 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
def create_price_rating_scatter(df):
|
98 |
-
|
99 |
-
|
100 |
-
return None
|
101 |
-
fig = px.scatter(
|
102 |
-
df,
|
103 |
-
x='價格',
|
104 |
-
y='評分',
|
105 |
-
text='飯店名稱',
|
106 |
-
size='價格', # 點大小基於價格
|
107 |
-
color='評分',
|
108 |
-
title='台南飯店價格與評分關係圖',
|
109 |
-
labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
|
110 |
-
)
|
111 |
fig.update_layout(height=600, title_x=0.5)
|
112 |
return fig
|
113 |
|
114 |
def create_price_distribution(df):
|
115 |
-
if df.empty:
|
116 |
-
st.warning("數據為空,無法生成圖表。")
|
117 |
-
return None
|
118 |
fig = go.Figure()
|
119 |
-
fig.add_trace(go.Histogram(
|
120 |
-
|
121 |
-
name='價格分布',
|
122 |
-
nbinsx=10,
|
123 |
-
marker_color='rgb(55, 83, 109)'
|
124 |
-
))
|
125 |
-
fig.add_trace(go.Box(
|
126 |
-
x=df['價格'],
|
127 |
-
name='價格箱型圖',
|
128 |
-
marker_color='rgb(26, 118, 255)'
|
129 |
-
))
|
130 |
fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
|
131 |
return fig
|
132 |
|
133 |
-
def upload_to_google_sheets(df):
|
134 |
-
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
|
135 |
-
gs = gspread.authorize(creds)
|
136 |
-
sheet = gs.open_by_url(SPREADSHEET_URL)
|
137 |
-
worksheet = sheet.get_worksheet(0)
|
138 |
-
df1 = df.astype(str)
|
139 |
-
worksheet.update([df1.columns.values.tolist()] + df1.values.tolist())
|
140 |
-
return "資料已成功上傳到 Google Sheet!"
|
141 |
-
|
142 |
# Streamlit app implementation
|
143 |
if mode == "資料爬取":
|
144 |
st.header("爬取台南飯店資料")
|
@@ -159,15 +111,10 @@ elif mode == "資料視覺化":
|
|
159 |
df['評分'] = df['評分'].apply(clean_rating)
|
160 |
df = df.dropna(subset=['價格'])
|
161 |
|
162 |
-
st.write(f"有效數據行數:{len(df)}")
|
163 |
-
st.write(f"價格缺失值數量:{df['價格'].isna().sum()}")
|
164 |
-
|
165 |
scatter_fig = create_price_rating_scatter(df)
|
166 |
-
|
167 |
-
st.plotly_chart(scatter_fig)
|
168 |
dist_fig = create_price_distribution(df)
|
169 |
-
|
170 |
-
st.plotly_chart(dist_fig)
|
171 |
except Exception as e:
|
172 |
st.error(f"讀取或分析資料時發生錯誤:{e}")
|
173 |
|
@@ -178,4 +125,4 @@ elif mode == "上傳至 Google Sheet":
|
|
178 |
result = upload_to_google_sheets(df)
|
179 |
st.success(result)
|
180 |
except Exception as e:
|
181 |
-
st.error(f"上傳資料時發生錯誤:{e}")
|
|
|
2 |
import requests
|
3 |
from bs4 import BeautifulSoup
|
4 |
import pandas as pd
|
|
|
|
|
5 |
import plotly.express as px
|
6 |
import plotly.graph_objects as go
|
7 |
+
from plotly.subplots import make_subplots
|
8 |
+
from google.oauth2.service_account import Credentials
|
9 |
+
import gspread
|
10 |
|
11 |
# Google Sheets credentials
|
12 |
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
|
13 |
SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
|
14 |
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
|
15 |
|
16 |
+
# Streamlit app setup
|
17 |
st.title("Booking.com 台南飯店資料爬取與分析")
|
18 |
st.sidebar.header("功能選擇")
|
19 |
mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
|
|
|
43 |
hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
|
44 |
|
45 |
for hotel in hotel_cards:
|
46 |
+
name = hotel.find('div', {'data-testid': 'title'}).text.strip() if hotel.find('div', {'data-testid': 'title'}) else "無資料"
|
47 |
+
price_elem = hotel.find('span', {'data-testid': 'price-and-discounted-price'})
|
48 |
+
price = price_elem.text.strip().replace('TWD', '').replace(',', '').strip() if price_elem else "無資料"
|
49 |
+
rating_elem = hotel.find('div', {'class': 'a3b8729ab1'})
|
50 |
+
rating = rating_elem.text.strip() if rating_elem else "無評分"
|
51 |
+
description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
|
52 |
+
if description_elem:
|
53 |
+
room_type = description_elem.find('h4').text.strip() if description_elem.find('h4') else ""
|
54 |
+
bed_info = description_elem.find('div').text.strip() if description_elem.find('div') else ""
|
55 |
+
cancellation = "可免費取消" if description_elem.find('strong', text='可免費取消') else ""
|
56 |
+
payment = "無需訂金" if description_elem.find('strong', text='無需訂金') else ""
|
57 |
+
description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
|
58 |
+
else:
|
59 |
+
description = "無說明"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
+
hotels_data.append({'飯店名稱': name, '價格': price, '評分': rating, '說明': description})
|
62 |
+
return pd.DataFrame(hotels_data).drop_duplicates()
|
63 |
except requests.RequestException:
|
64 |
+
st.error("無法從網站獲取資料")
|
65 |
return pd.DataFrame()
|
66 |
|
67 |
def clean_rating(x):
|
|
|
69 |
return 0
|
70 |
return float(str(x).replace('分數', '').replace('分', ''))
|
71 |
|
72 |
+
def upload_to_google_sheets(df):
|
73 |
+
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
|
74 |
+
gs = gspread.authorize(creds)
|
75 |
+
sheet = gs.open_by_url(SPREADSHEET_URL)
|
76 |
+
worksheet = sheet.get_worksheet(0)
|
77 |
+
worksheet.update([df.columns.values.tolist()] + df.astype(str).values.tolist())
|
78 |
+
return "資料已成功上傳到 Google Sheet!"
|
79 |
+
|
80 |
+
# Visualization functions
|
81 |
def create_price_rating_scatter(df):
|
82 |
+
fig = px.scatter(df, x='價格', y='評分', text='飯店名稱', size='價格', color='評分',
|
83 |
+
title='台南飯店價格與評分關係圖', labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
fig.update_layout(height=600, title_x=0.5)
|
85 |
return fig
|
86 |
|
87 |
def create_price_distribution(df):
|
|
|
|
|
|
|
88 |
fig = go.Figure()
|
89 |
+
fig.add_trace(go.Histogram(x=df['價格'], name='價格分布', nbinsx=10, marker_color='rgb(55, 83, 109)'))
|
90 |
+
fig.add_trace(go.Box(x=df['價格'], name='價格箱型圖', marker_color='rgb(26, 118, 255)'))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
|
92 |
return fig
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
# Streamlit app implementation
|
95 |
if mode == "資料爬取":
|
96 |
st.header("爬取台南飯店資料")
|
|
|
111 |
df['評分'] = df['評分'].apply(clean_rating)
|
112 |
df = df.dropna(subset=['價格'])
|
113 |
|
|
|
|
|
|
|
114 |
scatter_fig = create_price_rating_scatter(df)
|
115 |
+
st.plotly_chart(scatter_fig)
|
|
|
116 |
dist_fig = create_price_distribution(df)
|
117 |
+
st.plotly_chart(dist_fig)
|
|
|
118 |
except Exception as e:
|
119 |
st.error(f"讀取或分析資料時發生錯誤:{e}")
|
120 |
|
|
|
125 |
result = upload_to_google_sheets(df)
|
126 |
st.success(result)
|
127 |
except Exception as e:
|
128 |
+
st.error(f"上傳資料時發生錯誤:{e}")
|