Spaces:
Sleeping
Sleeping
File size: 6,880 Bytes
f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c dd5d47e 65e4f78 dd5d47e f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c dd5d47e f39d20c 65e4f78 f39d20c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
# file_path: app.py
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from google.oauth2.service_account import Credentials
import gspread
import plotly.express as px
import plotly.graph_objects as go
# Google Sheets credentials
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
# Streamlit app
st.title("Booking.com 台南飯店資料爬取與分析")
st.sidebar.header("功能選擇")
mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
@st.cache_data
def scrape_booking_hotel():
url = "https://www.booking.com/searchresults.zh-tw.html"
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
}
params = {
'ss': '台南',
'checkin': '2024-11-16',
'checkout': '2024-11-17',
'group_adults': '2',
'no_rooms': '1',
'group_children': '0',
'dest_id': '-2637868',
'dest_type': 'city'
}
try:
response = requests.get(url, headers=headers, params=params)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
hotels_data = []
hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
for hotel in hotel_cards:
try:
name_elem = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'})
name = name_elem.text.strip() if name_elem else "無資料"
price_elem = hotel.find('span', {
'data-testid': 'price-and-discounted-price',
'class': 'f6431b446c'
})
price = price_elem.text.strip() if price_elem else "無資料"
price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip()
rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None
rating = rating_elem.text.strip() if rating_elem else "無評分"
description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
if description_elem:
room_type = description_elem.find('h4', {'class': 'abf093bdfe'})
room_type = room_type.text.strip() if room_type else ""
bed_info = description_elem.find('div', {'class': 'abf093bdfe'})
bed_info = bed_info.text.strip() if bed_info else ""
cancellation = description_elem.find('strong', text='可免費取消')
cancellation = "可免費取消" if cancellation else ""
payment = description_elem.find('strong', text='無需訂金')
payment = "無需訂金" if payment else ""
description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
else:
description = "無說明"
hotels_data.append({
'飯店名稱': name,
'價格': price,
'評分': rating,
'說明': description
})
except AttributeError:
continue
df = pd.DataFrame(hotels_data).drop_duplicates()
return df
except requests.RequestException:
return pd.DataFrame()
def clean_rating(x):
if pd.isna(x) or x == '無評分':
return 0
return float(str(x).replace('分數', '').replace('分', ''))
def create_price_rating_scatter(df):
if df.empty:
st.warning("沒有可視化的有效數據。請檢查輸入資料。")
return None # 無數據時返回空
fig = px.scatter(
df,
x='價格',
y='評分',
text='飯店名稱',
size='價格', # 點的大小根據價格變化
color='評分',
title='台南飯店價格與評分關係圖',
labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
)
fig.update_layout(height=600, title_x=0.5)
return fig
def create_price_distribution(df):
fig = go.Figure()
fig.add_trace(go.Histogram(
x=df['價格'],
name='價格分布',
nbinsx=10,
marker_color='rgb(55, 83, 109)'
))
fig.add_trace(go.Box(
x=df['價格'],
name='價格箱型圖',
marker_color='rgb(26, 118, 255)'
))
fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
return fig
def upload_to_google_sheets(df):
creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
gs = gspread.authorize(creds)
sheet = gs.open_by_url(SPREADSHEET_URL)
worksheet = sheet.get_worksheet(0)
df1 = df.astype(str)
worksheet.update([df1.columns.values.tolist()] + df1.values.tolist())
return "資料已成功上傳到 Google Sheet!"
if mode == "資料爬取":
st.header("爬取台南飯店資料")
if st.button("開始爬取"):
df = scrape_booking_hotel()
if not df.empty:
st.dataframe(df)
df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
else:
st.error("未能成功爬取資料")
elif mode == "資料視覺化":
st.header("分析與視覺化")
try:
df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
# 清理數據
df['價格'] = pd.to_numeric(df['價格'], errors='coerce') # 無效值轉為 NaN
df['評分'] = df['評分'].apply(clean_rating) # 清理評分數據
df = df.dropna(subset=['價格']) # 移除價格為 NaN 的行
# 顯示數據摘要
st.write(f"有效數據行數:{len(df)}")
st.write(f"價格缺失值數量:{df['價格'].isna().sum()}")
# 繪製圖表
st.plotly_chart(create_price_rating_scatter(df))
st.plotly_chart(create_price_distribution(df))
except Exception as e:
st.error(f"讀取或分析資料時發生錯誤:{e}")
elif mode == "上傳至 Google Sheet":
st.header("上傳資料至 Google Sheet")
try:
df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
result = upload_to_google_sheets(df)
st.success(result)
except Exception as e:
st.error(f"上傳資料時發生錯誤:{e}")
|