# file_path: app.py import streamlit as st import requests from bs4 import BeautifulSoup import pandas as pd from google.oauth2.service_account import Credentials import gspread import plotly.express as px import plotly.graph_objects as go # Google Sheets credentials SCOPE = ['https://www.googleapis.com/auth/spreadsheets'] SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json" SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0" # Streamlit app st.title("Booking.com 台南飯店資料爬取與分析") st.sidebar.header("功能選擇") mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"]) @st.cache_data def scrape_booking_hotel(): url = "https://www.booking.com/searchresults.zh-tw.html" headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7', } params = { 'ss': '台南', 'checkin': '2024-11-16', 'checkout': '2024-11-17', 'group_adults': '2', 'no_rooms': '1', 'group_children': '0', 'dest_id': '-2637868', 'dest_type': 'city' } try: response = requests.get(url, headers=headers, params=params) response.raise_for_status() soup = BeautifulSoup(response.text, 'html.parser') hotels_data = [] hotel_cards = soup.find_all('div', {'data-testid': 'property-card'}) for hotel in hotel_cards: try: name_elem = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'}) name = name_elem.text.strip() if name_elem else "無資料" price_elem = hotel.find('span', { 'data-testid': 'price-and-discounted-price', 'class': 'f6431b446c' }) price = price_elem.text.strip() if price_elem else "無資料" price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip() rating_container = hotel.find('div', {'class': 'a3b8729ab1'}) rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None rating = rating_elem.text.strip() if rating_elem else "無評分" description_elem = hotel.find('div', {'data-testid': 'recommended-units'}) if description_elem: room_type = description_elem.find('h4', {'class': 'abf093bdfe'}) room_type = room_type.text.strip() if room_type else "" bed_info = description_elem.find('div', {'class': 'abf093bdfe'}) bed_info = bed_info.text.strip() if bed_info else "" cancellation = description_elem.find('strong', text='可免費取消') cancellation = "可免費取消" if cancellation else "" payment = description_elem.find('strong', text='無需訂金') payment = "無需訂金" if payment else "" description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |') else: description = "無說明" hotels_data.append({ '飯店名稱': name, '價格': price, '評分': rating, '說明': description }) except AttributeError: continue df = pd.DataFrame(hotels_data).drop_duplicates() return df except requests.RequestException: return pd.DataFrame() def clean_rating(x): if pd.isna(x) or x == '無評分': return 0 return float(str(x).replace('分數', '').replace('分', '')) def create_price_rating_scatter(df): if df.empty: st.warning("沒有可視化的有效數據。請檢查輸入資料。") return None # 無數據時返回空 fig = px.scatter( df, x='價格', y='評分', text='飯店名稱', size='價格', # 點的大小根據價格變化 color='評分', title='台南飯店價格與評分關係圖', labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'} ) fig.update_layout(height=600, title_x=0.5) return fig def create_price_distribution(df): fig = go.Figure() fig.add_trace(go.Histogram( x=df['價格'], name='價格分布', nbinsx=10, marker_color='rgb(55, 83, 109)' )) fig.add_trace(go.Box( x=df['價格'], name='價格箱型圖', marker_color='rgb(26, 118, 255)' )) fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500) return fig def upload_to_google_sheets(df): creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE) gs = gspread.authorize(creds) sheet = gs.open_by_url(SPREADSHEET_URL) worksheet = sheet.get_worksheet(0) df1 = df.astype(str) worksheet.update([df1.columns.values.tolist()] + df1.values.tolist()) return "資料已成功上傳到 Google Sheet!" if mode == "資料爬取": st.header("爬取台南飯店資料") if st.button("開始爬取"): df = scrape_booking_hotel() if not df.empty: st.dataframe(df) df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig') st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv") else: st.error("未能成功爬取資料") elif mode == "資料視覺化": st.header("分析與視覺化") try: df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig') # 清理數據 df['價格'] = pd.to_numeric(df['價格'], errors='coerce') # 無效值轉為 NaN df['評分'] = df['評分'].apply(clean_rating) # 清理評分數據 df = df.dropna(subset=['價格']) # 移除價格為 NaN 的行 # 顯示數據摘要 st.write(f"有效數據行數:{len(df)}") st.write(f"價格缺失值數量:{df['價格'].isna().sum()}") # 繪製圖表 st.plotly_chart(create_price_rating_scatter(df)) st.plotly_chart(create_price_distribution(df)) except Exception as e: st.error(f"讀取或分析資料時發生錯誤:{e}") elif mode == "上傳至 Google Sheet": st.header("上傳資料至 Google Sheet") try: df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig') result = upload_to_google_sheets(df) st.success(result) except Exception as e: st.error(f"上傳資料時發生錯誤:{e}")