File size: 6,417 Bytes
f39d20c
 
 
 
65e4f78
f39d20c
 
65e4f78
 
 
 
f39d20c
 
 
 
 
 
 
 
 
 
65e4f78
f39d20c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65e4f78
f39d20c
 
 
 
65e4f78
f39d20c
 
 
 
 
 
65e4f78
 
 
 
 
f39d20c
 
65e4f78
 
 
f39d20c
65e4f78
 
f39d20c
65e4f78
 
 
 
 
 
 
 
 
 
 
 
f39d20c
65e4f78
 
f39d20c
 
 
 
 
 
 
 
65e4f78
f39d20c
 
 
 
 
 
 
 
 
 
65e4f78
f39d20c
 
 
 
 
 
 
 
 
 
65e4f78
f39d20c
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# file_path: app.py
import re

import gspread
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import streamlit as st
from bs4 import BeautifulSoup
from google.oauth2.service_account import Credentials
from plotly.subplots import make_subplots

# Google Sheets credentials
# OAuth scopes requested when the service-account credentials are built.
SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
# Service-account JSON key file passed to Credentials.from_service_account_file.
SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
# Target spreadsheet, opened by URL; uploads are written to its first worksheet.
SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"

# Streamlit page chrome: the title, plus a sidebar selector whose value
# (`mode`) decides which of the three workflows the script runs.
st.title("Booking.com 台南飯店資料爬取與分析")
sidebar = st.sidebar
sidebar.header("功能選擇")
mode = sidebar.selectbox(
    "選擇模式",
    ["資料爬取", "資料視覺化", "上傳至 Google Sheet"],
)

def _parse_hotel_card(hotel):
    """Pull the name, price, rating and room description out of one property card.

    Args:
        hotel: BeautifulSoup tag of a ``data-testid="property-card"`` div.

    Returns:
        A dict keyed by the output column names. Fields that cannot be found
        hold their placeholder text ("無資料", "無評分", "無說明").
    """
    # NOTE(review): the hashed class names below look auto-generated and are
    # presumably tied to the site's current markup — confirm they still match.
    name_elem = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'})
    name = name_elem.text.strip() if name_elem else "無資料"

    price_elem = hotel.find('span', {
        'data-testid': 'price-and-discounted-price',
        'class': 'f6431b446c',
    })
    price = price_elem.text.strip() if price_elem else "無資料"
    # Keep only the amount so the column can later be parsed as a number.
    price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip()

    rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
    rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None
    rating = rating_elem.text.strip() if rating_elem else "無評分"

    description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
    if description_elem:
        room_type = description_elem.find('h4', {'class': 'abf093bdfe'})
        bed_info = description_elem.find('div', {'class': 'abf093bdfe'})
        # `string=` is the supported spelling of the deprecated `text=` filter.
        free_cancellation = description_elem.find('strong', string='可免費取消')
        no_prepayment = description_elem.find('strong', string='無需訂金')
        parts = [
            room_type.text.strip() if room_type else "",
            bed_info.text.strip() if bed_info else "",
            "可免費取消" if free_cancellation else "",
            "無需訂金" if no_prepayment else "",
        ]
        # Join only the parts that exist so no empty " |  | " gaps are left
        # in the middle of the description.
        description = " | ".join(part for part in parts if part)
    else:
        description = "無說明"

    return {
        '飯店名稱': name,
        '價格': price,
        '評分': rating,
        '說明': description,
    }


@st.cache_data
def _fetch_booking_hotels(checkin, checkout):
    """Download the Tainan search results page and parse every property card.

    Request errors are deliberately NOT caught here: an exception raised by a
    cached function is not stored in the cache, so a failed request is retried
    on the next call instead of replaying a cached empty result.

    Args:
        checkin: Check-in date as a ``YYYY-MM-DD`` string.
        checkout: Check-out date as a ``YYYY-MM-DD`` string.

    Returns:
        De-duplicated DataFrame with columns 飯店名稱, 價格, 評分, 說明
        (empty when the page contains no property cards).

    Raises:
        requests.RequestException: On connection errors, timeouts or a
            non-success HTTP status.
    """
    url = "https://www.booking.com/searchresults.zh-tw.html"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    }
    params = {
        'ss': '台南',
        'checkin': checkin,
        'checkout': checkout,
        'group_adults': '2',
        'no_rooms': '1',
        'group_children': '0',
        'dest_id': '-2637868',
        'dest_type': 'city',
    }
    # Without a timeout a stalled connection would block the app forever.
    response = requests.get(url, headers=headers, params=params, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    hotels_data = []
    for hotel in soup.find_all('div', {'data-testid': 'property-card'}):
        try:
            hotels_data.append(_parse_hotel_card(hotel))
        except AttributeError:
            # Best effort: a card with unexpected structure is skipped.
            continue

    return pd.DataFrame(hotels_data).drop_duplicates()


def scrape_booking_hotel(checkin='2024-11-16', checkout='2024-11-17'):
    """Scrape Booking.com search results for hotels in Tainan.

    Successful results are cached per (checkin, checkout) pair; failed
    requests are not cached, so calling again retries the download.

    Args:
        checkin: Check-in date as a ``YYYY-MM-DD`` string.
        checkout: Check-out date as a ``YYYY-MM-DD`` string.

    Returns:
        DataFrame with columns 飯店名稱, 價格, 評分, 說明, or an empty
        DataFrame when the request fails.
    """
    try:
        return _fetch_booking_hotels(checkin, checkout)
    except requests.RequestException:
        return pd.DataFrame()

def clean_rating(x):
    """Convert a raw rating cell into a number.

    Args:
        x: Value from the rating column — text such as ``'分數8.5'``, an
            already numeric value, the ``'無評分'`` placeholder, or NaN.

    Returns:
        The rating as a float, or 0 when the value is missing, is the
        placeholder, or contains no number at all.
    """
    if pd.isna(x) or x == '無評分':
        return 0
    text = str(x).replace('分數', '').replace('分', '').strip()
    try:
        return float(text)
    except ValueError:
        # Extra label text (or an empty cell) used to abort the whole chart;
        # fall back to the first number found, accepting a decimal comma.
        match = re.search(r'\d+(?:[.,]\d+)?', text)
        if match is None:
            return 0
        return float(match.group().replace(',', '.'))

def create_price_rating_scatter(df):
    """Build a price-versus-rating scatter plot with one labelled marker per hotel.

    Args:
        df: DataFrame with numeric '價格' and '評分' columns and a
            '飯店名稱' column used as the marker label.

    Returns:
        A plotly Figure; marker size follows the price and marker colour
        follows the rating.
    """
    # Marker size is driven by the price and plotly rejects NaN sizes, so
    # rows without a usable price (which could not be drawn anyway) are
    # dropped before plotting.
    plot_df = df.dropna(subset=['價格'])
    fig = px.scatter(
        plot_df,
        x='價格',
        y='評分',
        text='飯店名稱',
        size='價格',
        color='評分',
        title='台南飯店價格與評分關係圖',
        labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
    )
    fig.update_layout(height=600, title_x=0.5)
    return fig

def create_price_distribution(df):
    """Return a figure holding a histogram and a box plot of the '價格' column.

    Both traces share one set of axes; the histogram is added first and the
    box plot second.
    """
    prices = df['價格']
    histogram = go.Histogram(
        x=prices,
        name='價格分布',
        nbinsx=10,
        marker_color='rgb(55, 83, 109)',
    )
    box_plot = go.Box(
        x=prices,
        name='價格箱型圖',
        marker_color='rgb(26, 118, 255)',
    )

    figure = go.Figure(data=[histogram, box_plot])
    figure.update_layout(height=500, title_x=0.5, title_text='台南飯店價格分布')
    return figure

def upload_to_google_sheets(df):
    """Write a DataFrame to the first worksheet of the configured spreadsheet.

    The header row is sent first, followed by every data row, with all values
    converted to text. Nothing is caught here, so errors from loading the
    credentials or from the Sheets API propagate to the caller.

    Args:
        df: DataFrame to upload.

    Returns:
        The success message shown to the user.
    """
    creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
    gs = gspread.authorize(creds)
    sheet = gs.open_by_url(SPREADSHEET_URL)
    worksheet = sheet.get_worksheet(0)
    # A plain astype(str) turns missing values into the literal text "nan";
    # blank those positions so the sheet gets empty cells instead.
    df1 = df.astype(str).where(df.notna(), '')
    worksheet.update([df1.columns.values.tolist()] + df1.values.tolist())
    return "資料已成功上傳到 Google Sheet!"

def _run_scrape_mode():
    """Scrape on button press, show the table and save it as a CSV file."""
    st.header("爬取台南飯店資料")
    if not st.button("開始爬取"):
        return
    hotels = scrape_booking_hotel()
    if hotels.empty:
        st.error("未能成功爬取資料")
        return
    st.dataframe(hotels)
    hotels.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
    st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")


def _run_visualization_mode():
    """Load the saved CSV, coerce price and rating to numbers and draw both charts."""
    st.header("分析與視覺化")
    try:
        hotels = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        hotels['價格'] = pd.to_numeric(hotels['價格'], errors='coerce')
        hotels['評分'] = hotels['評分'].apply(clean_rating)
        for build_chart in (create_price_rating_scatter, create_price_distribution):
            st.plotly_chart(build_chart(hotels))
    except Exception as e:
        # Top-level UI boundary: report any failure instead of crashing the page.
        st.error(f"讀取或分析資料時發生錯誤:{e}")


def _run_upload_mode():
    """Load the saved CSV and push it to Google Sheets, reporting the outcome."""
    st.header("上傳資料至 Google Sheet")
    try:
        hotels = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
        st.success(upload_to_google_sheets(hotels))
    except Exception as e:
        # Top-level UI boundary: report any failure instead of crashing the page.
        st.error(f"上傳資料時發生錯誤:{e}")


# Map each sidebar choice to the routine that renders it; an unknown value
# renders nothing.
_MODE_HANDLERS = {
    "資料爬取": _run_scrape_mode,
    "資料視覺化": _run_visualization_mode,
    "上傳至 Google Sheet": _run_upload_mode,
}

_handler = _MODE_HANDLERS.get(mode)
if _handler is not None:
    _handler()