Ethscriptions committed on
Commit
7277266
·
verified ·
1 Parent(s): 14ad894

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import streamlit as st
4
+ from playwright.sync_api import sync_playwright, Error
5
+ from bs4 import BeautifulSoup
6
+ import pandas as pd
7
+ import time
8
+
9
# --- Page configuration ---
# Set the page title and icon, then render the page heading.
st.set_page_config(
    page_title="实时电影场次查询",
    page_icon="🎬",
)
st.title("🎬 实时电影场次查询")
12
+
13
# --- Playwright scraping ---
def _extract_showtimes(movie_sections):
    """Collect one record per showtime row from the parsed movie sections.

    Args:
        movie_sections: BeautifulSoup tags, one per ``div.show-list`` element.

    Returns:
        A list of dicts keyed by the column names used in the final table.
        Sections, dates, or rows missing an expected element are skipped.
    """
    records = []

    for movie_section in movie_sections:
        movie_name_tag = movie_section.find('h2', class_='movie-name')
        if movie_name_tag is None:
            continue
        movie_name = movie_name_tag.text.strip()

        date_tags = movie_section.find_all('span', class_='date-item')
        plist_containers = movie_section.find_all('div', class_='plist-container')

        for date_tag in date_tags:
            # Collapse internal whitespace/newlines in the date label.
            viewing_date = ' '.join(date_tag.text.strip().split())
            date_index = date_tag.get('data-index')

            # A date tab and its schedule container share the same data-index.
            matching_plist = next(
                (
                    plist
                    for plist in plist_containers
                    if plist.get('data-index') == date_index
                ),
                None,
            )
            if matching_plist is None:
                continue

            # find() returns None when the container has no <tbody>; skip the
            # date instead of letting an AttributeError escape and crash the app.
            table_body = matching_plist.find('tbody')
            if table_body is None:
                continue

            for show_row in table_body.find_all('tr'):
                try:
                    start_time = show_row.find('span', class_='begin-time').text.strip()
                    end_time_raw = show_row.find('span', class_='end-time').text.strip()
                    language = show_row.find('span', class_='lang').text.strip()
                    hall = show_row.find('span', class_='hall').text.strip()
                except AttributeError:
                    # A row without one of the expected spans is skipped.
                    continue

                records.append({
                    '电影名称': movie_name,
                    '观影日期': viewing_date,
                    '开始时间': start_time,
                    # Strip the "散场" suffix from the end-time text.
                    '结束时间': end_time_raw.replace('散场', ''),
                    '语言版本': language,
                    '影厅': hall,
                })

    return records


# Uses Streamlit's cache so a rerun does not re-scrape; entries live 10 minutes.
# NOTE(review): the empty DataFrame returned on the failure paths below is a
# normal return value, so it is presumably cached for the TTL as well, which
# would make "refresh and retry" a no-op until it expires. Confirm and decide
# whether failures should bypass the cache.
@st.cache_data(ttl=600)
def fetch_movie_data_live(url: str) -> pd.DataFrame:
    """Render ``url`` in headless Chromium and parse its showtime tables.

    Args:
        url: Address of the cinema page to scrape.

    Returns:
        A DataFrame with the columns 电影名称, 观影日期, 开始时间, 结束时间,
        语言版本 and 影厅, one row per showtime. An empty DataFrame is returned
        when Playwright fails, when no ``div.show-list`` element is found, or
        when no showtime rows could be parsed.
    """
    st.info("正在启动后台浏览器,实时抓取数据,请稍候...")

    try:
        with sync_playwright() as playwright:
            # Launch headless Chromium (the original note says it is
            # pre-installed in the Docker environment).
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url, timeout=60000)
                # Wait for the key element to appear.
                page.wait_for_selector('div.movie-list-container', timeout=30000)
                time.sleep(2)  # extra wait for rendering
                html_content = page.content()
            finally:
                # Make sure the browser is closed, even if a step above fails.
                browser.close()
    except Error as e:
        st.error(f"Playwright 在服务器上运行时出错: {e}")
        st.error("这可能是由于Hugging Face服务器资源临时紧张或目标网站反爬虫策略导致。请稍后刷新重试。")
        return pd.DataFrame()

    st.info("数据抓取完成,正在解析...")

    soup = BeautifulSoup(html_content, 'html.parser')
    movie_show_lists = soup.find_all('div', class_='show-list')

    if not movie_show_lists:
        st.error("解析失败:在页面中未找到电影列表。")
        return pd.DataFrame()

    all_showtimes_data = _extract_showtimes(movie_show_lists)
    if not all_showtimes_data:
        return pd.DataFrame()

    return pd.DataFrame(all_showtimes_data)
91
+
92
+
93
# --- Main application flow ---
cinema_url = "https://www.maoyan.com/cinema/15050?poi=97785807"
df = fetch_movie_data_live(cinema_url)

if df.empty:
    st.warning("未能加载到任何场次信息。")
else:
    st.success("实时数据加载成功!")

    # UI: choose a movie first, then one of that movie's dates.
    movie_choice = st.selectbox("请选择电影:", df['电影名称'].unique())

    if movie_choice:
        st.subheader(f"《{movie_choice}》的场次信息")
        movie_rows = df[df['电影名称'] == movie_choice]

        date_choice = st.selectbox("请选择日期:", movie_rows['观影日期'].unique())

        if date_choice:
            day_rows = movie_rows[movie_rows['观影日期'] == date_choice]
            # The movie and date are already shown by the selectors above, so
            # drop those columns and renumber the remaining rows from zero.
            schedule = day_rows.drop(columns=['电影名称', '观影日期'])
            schedule = schedule.reset_index(drop=True)
            st.dataframe(schedule, use_container_width=True)