Spaces:
Sleeping
Sleeping
File size: 6,417 Bytes
f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c 65e4f78 f39d20c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
# file_path: app.py
import re

import gspread
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import streamlit as st
from bs4 import BeautifulSoup
from google.oauth2.service_account import Credentials
from plotly.subplots import make_subplots
# Google Sheets credentials
# OAuth scope requested when the service-account credentials are created
# in upload_to_google_sheets().
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
# Service-account key file loaded by upload_to_google_sheets().
SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
# Spreadsheet opened by upload_to_google_sheets(); the data is written to its
# first worksheet.
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
# Streamlit app
# Page title, sidebar header, and the mode selector. The value of `mode`
# decides which of the three pages the script renders further down.
st.title("Booking.com 台南飯店資料爬取與分析")
st.sidebar.header("功能選擇")
mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
def _parse_hotel_card(hotel):
    """Extract name, price, rating and description from one property card.

    Args:
        hotel: The BeautifulSoup tag of a single ``property-card`` element.

    Returns:
        A dict with the keys ``飯店名稱``, ``價格``, ``評分`` and ``說明``.
        Fields that cannot be found fall back to placeholder strings.
    """
    name_elem = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'})
    name = name_elem.text.strip() if name_elem else "無資料"

    price_elem = hotel.find('span', {
        'data-testid': 'price-and-discounted-price',
        'class': 'f6431b446c',
    })
    price = price_elem.text.strip() if price_elem else "無資料"
    # Strip the currency marker and thousands separators so the value can be
    # parsed as a number later on.
    price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip()

    rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
    rating_elem = (
        rating_container.find('div', {'class': 'ac4a7896c7'})
        if rating_container else None
    )
    rating = rating_elem.text.strip() if rating_elem else "無評分"

    description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
    if description_elem:
        room_type = description_elem.find('h4', {'class': 'abf093bdfe'})
        bed_info = description_elem.find('div', {'class': 'abf093bdfe'})
        # `string=` is the current name of the deprecated `text=` argument.
        free_cancel = description_elem.find('strong', string='可免費取消')
        no_deposit = description_elem.find('strong', string='無需訂金')
        parts = [
            room_type.text.strip() if room_type else "",
            bed_info.text.strip() if bed_info else "",
            "可免費取消" if free_cancel else "",
            "無需訂金" if no_deposit else "",
        ]
        # Join only the parts that are present so no dangling " | "
        # separators are left behind.
        description = ' | '.join(part for part in parts if part)
    else:
        description = "無說明"

    return {
        '飯店名稱': name,
        '價格': price,
        '評分': rating,
        '說明': description,
    }


@st.cache_data
def _scrape_booking_hotel_cached(checkin, checkout):
    """Fetch and parse the Booking.com search page; request errors propagate.

    Only successful results are cached: when the request raises, nothing is
    stored, so the next call tries again.
    """
    url = "https://www.booking.com/searchresults.zh-tw.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    params = {
        'ss': '台南',
        'checkin': checkin,
        'checkout': checkout,
        'group_adults': '2',
        'no_rooms': '1',
        'group_children': '0',
        'dest_id': '-2637868',
        'dest_type': 'city',
    }
    # Without a timeout a stalled connection would block the app forever.
    response = requests.get(url, headers=headers, params=params, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    hotels_data = []
    for hotel in soup.find_all('div', {'data-testid': 'property-card'}):
        try:
            hotels_data.append(_parse_hotel_card(hotel))
        except AttributeError:
            # Best effort: skip a card whose markup cannot be parsed.
            continue
    return pd.DataFrame(hotels_data).drop_duplicates()


def scrape_booking_hotel(checkin='2024-11-16', checkout='2024-11-17'):
    """Scrape Tainan hotel listings from the Booking.com search page.

    Args:
        checkin: Check-in date sent as the ``checkin`` query parameter
            (``YYYY-MM-DD`` string).
        checkout: Check-out date sent as the ``checkout`` query parameter
            (``YYYY-MM-DD`` string).

    Returns:
        A DataFrame with the columns ``飯店名稱``, ``價格``, ``評分`` and
        ``說明`` (duplicates removed), or an empty DataFrame when the HTTP
        request fails or no property cards are found.
    """
    try:
        return _scrape_booking_hotel_cached(checkin, checkout)
    except requests.RequestException:
        # The failure is handled here, outside the cached helper, so an
        # empty result caused by a request error is never cached.
        return pd.DataFrame()


# Keep the cache-clearing hook reachable under the public name.
scrape_booking_hotel.clear = _scrape_booking_hotel_cached.clear
def clean_rating(x):
    """Convert a scraped rating cell into a number.

    Args:
        x: The raw rating value: a number, a string such as ``'8.5'`` or
            ``'分數8.5'``, the placeholder ``'無評分'``, or a missing value.

    Returns:
        The rating as a float, or 0 when the value is missing, is the
        ``'無評分'`` placeholder, or contains no number at all.
    """
    if pd.isna(x) or x == '無評分':
        return 0
    text = str(x).replace('分數', '').replace('分', '')
    try:
        return float(text)
    except ValueError:
        # The cell holds extra text around the score (e.g. '分數:8.5') or no
        # score at all. Take the first numeric token instead of letting one
        # odd cell abort the whole analysis.
        match = re.search(r'\d+(?:\.\d+)?', text)
        return float(match.group()) if match else 0
def create_price_rating_scatter(df):
    """Build a scatter plot of hotel price against rating.

    Args:
        df: DataFrame with a numeric ``價格`` column, a numeric ``評分``
            column and a ``飯店名稱`` column used as the point label.

    Returns:
        A Plotly figure in which marker size follows the price and marker
        colour follows the rating.
    """
    # Rows without a price or rating cannot be positioned, and Plotly rejects
    # NaN as a marker size, so leave them out instead of failing the chart.
    plottable = df.dropna(subset=['價格', '評分'])
    fig = px.scatter(
        plottable,
        x='價格',
        y='評分',
        text='飯店名稱',
        size='價格',
        color='評分',
        title='台南飯店價格與評分關係圖',
        labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'},
    )
    fig.update_layout(height=600, title_x=0.5)
    return fig
def create_price_distribution(df):
    """Build a figure showing how hotel prices are distributed.

    Args:
        df: DataFrame containing a ``價格`` column.

    Returns:
        A Plotly figure holding a 10-bin histogram trace and a box-plot
        trace, both drawn from the price column.
    """
    prices = df['價格']
    histogram_trace = go.Histogram(
        x=prices,
        name='價格分布',
        nbinsx=10,
        marker_color='rgb(55, 83, 109)',
    )
    box_trace = go.Box(
        x=prices,
        name='價格箱型圖',
        marker_color='rgb(26, 118, 255)',
    )
    # Traces are added in the same order as before: histogram first, box second.
    figure = go.Figure(data=[histogram_trace, box_trace])
    figure.update_layout(height=500, title_x=0.5, title_text='台南飯店價格分布')
    return figure
def upload_to_google_sheets(df):
    """Write a DataFrame to the first worksheet of the configured spreadsheet.

    Args:
        df: The data to upload. Column names become the header row and every
            value is sent as text.

    Returns:
        A success message for display in the UI.

    Raises:
        Any error raised while loading the credentials or calling the
        Google Sheets API is propagated to the caller.
    """
    creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
    client = gspread.authorize(creds)
    worksheet = client.open_by_url(SPREADSHEET_URL).get_worksheet(0)

    # astype(str) alone would turn missing cells into the literal text "nan";
    # blank them out so empty fields stay empty in the sheet.
    cells = df.astype(str).mask(df.isna(), '')
    rows = [cells.columns.tolist()] + cells.values.tolist()

    # NOTE(review): update() presumably only overwrites the range covered by
    # `rows`, so rows from an earlier, longer upload may remain. Confirm
    # whether the worksheet should be cleared first.
    worksheet.update(rows)
    return "資料已成功上傳到 Google Sheet!"
def _show_scrape_page():
    """Render the scraping page: scrape on button press, show and save the result."""
    st.header("爬取台南飯店資料")
    if not st.button("開始爬取"):
        return
    hotels = scrape_booking_hotel()
    if hotels.empty:
        st.error("未能成功爬取資料")
        return
    st.dataframe(hotels)
    hotels.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
    st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")


def _show_visualization_page():
    """Render the analysis page: load the saved CSV and draw both charts."""
    st.header("分析與視覺化")
    try:
        hotels = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        hotels['價格'] = pd.to_numeric(hotels['價格'], errors='coerce')
        hotels['評分'] = hotels['評分'].apply(clean_rating)
        st.plotly_chart(create_price_rating_scatter(hotels))
        st.plotly_chart(create_price_distribution(hotels))
    except Exception as e:
        st.error(f"讀取或分析資料時發生錯誤:{e}")


def _show_upload_page():
    """Render the upload page: push the saved CSV to Google Sheets."""
    st.header("上傳資料至 Google Sheet")
    try:
        hotels = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        st.success(upload_to_google_sheets(hotels))
    except Exception as e:
        st.error(f"上傳資料時發生錯誤:{e}")


# Map each sidebar choice to the function that renders its page.
_PAGE_RENDERERS = {
    "資料爬取": _show_scrape_page,
    "資料視覺化": _show_visualization_page,
    "上傳至 Google Sheet": _show_upload_page,
}

_render_page = _PAGE_RENDERERS.get(mode)
if _render_page is not None:
    _render_page()
|