AIRider committed on
Commit
29582f1
·
verified ·
1 Parent(s): 9efd64d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from requests.adapters import HTTPAdapter
3
+ from requests.packages.urllib3.util.retry import Retry
4
+ from bs4 import BeautifulSoup
5
+ import gradio as gr
6
+ import datetime
7
+ import pandas as pd
8
+ import xlsxwriter
9
+
10
def setup_session():
    """Create a ``requests`` session with automatic retries for HTTPS requests.

    The retry policy allows up to 5 retries with a backoff factor of 1 and
    additionally retries on HTTP 502, 503 and 504 responses. Only the
    ``https://`` prefix gets the retrying adapter.

    Returns:
        The configured ``requests.Session``.
    """
    retry_policy = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    http_session.mount('https://', https_adapter)
    return http_session
15
+
16
def get_base_url(board_select):
    """Return the article-list URL for the selected board.

    Every board URL shares the same query layout and differs only in the
    club id (used for both ``search.clubid`` and ``search.cafeId``) and the
    menu id, so only those two values are stored per board.

    Args:
        board_select: Board label as offered by the UI radio group.

    Returns:
        The listing URL (without a ``search.page`` parameter), or the
        sentinel string ``"Invalid board selected"`` for an unknown label.
    """
    # Board label -> (club id, menu id).
    board_ids = {
        "๋ง˜์ด๋ฒ ๋ฒ ": (29434212, 2),
        "๋ง˜์Šคํ™€๋ฆญ": (10094499, 599),
        "๊ด‘์ฃผ๋ง˜": (26025763, 508),
        "์‡ผํ•‘์ง€๋ฆ„์‹ ": (25729954, 751),
        "๋ถ€์‚ฐ๋ง˜": (28707025, 282),
        "์ง„ํฌ๋ง˜": (21442290, 476),
    }

    ids = board_ids.get(board_select)
    if ids is None:
        return "Invalid board selected"

    club_id, menu_id = ids
    return (
        "https://cafe.naver.com/ArticleList.nhn"
        f"?search.clubid={club_id}&search.menuid={menu_id}"
        "&search.boardtype=L&userDisplay=50&search.specialmenutype="
        f"&search.totalCount=501&search.cafeId={club_id}"
    )
26
+
27
def convert_views(view_string):
    """Convert a view-count label into an integer.

    Thousands separators (``,``) and surrounding whitespace are ignored.
    When the label contains the '๋งŒ' marker, the remaining number is
    multiplied by 10,000.

    Args:
        view_string: Text of the view-count cell, e.g. ``"1,234"``.

    Returns:
        The view count as an ``int``.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    cleaned = view_string.strip().replace(",", "")
    if '๋งŒ' in cleaned:
        number_part = cleaned.replace('๋งŒ', '')
        # round() instead of int(): a float product can land just below the
        # intended integer (0.57 * 10000 == 5699.999999999999) and plain
        # truncation would then lose one view.
        return round(float(number_part) * 10000)
    return int(cleaned)
32
+
33
def _parse_article_row(row_data, current_date):
    """Extract one article from a board table row.

    Returns a ``(full_link, title, date, views, likes)`` tuple, or ``None``
    when the row has no ``a.article`` anchor or one of the expected cells is
    missing or not numeric.
    """
    a_tag = row_data.find('a', class_='article')
    if not a_tag:
        return None
    try:
        full_link = f"https://cafe.naver.com{a_tag['href']}"
        title = a_tag.get_text(strip=True)
        views = convert_views(row_data.find('td', class_='td_view').get_text(strip=True))
        likes = int(row_data.find('td', class_='td_likes').get_text(strip=True).replace(",", ""))
        date = row_data.find('td', class_='td_date').get_text(strip=True)
    except (AttributeError, KeyError, ValueError):
        # Missing cell (AttributeError), anchor without href (KeyError) or a
        # non-numeric counter (ValueError): skip this row, keep the rest.
        return None
    # A date cell containing ":" is replaced by today's date; presumably the
    # board shows a clock time for same-day posts -- confirm against the site.
    if ":" in date:
        date = current_date
    return full_link, title, date, views, likes


def extract_data_to_excel_and_html(page, board_select):
    """Scrape a board's article list into an Excel file and an HTML table.

    Pages ``1..page`` of the selected board are fetched. On each page the
    second ``div`` with class ``article-board m-tcol-c`` is used; pages with
    fewer than two such divs are skipped. Each table row that contains an
    ``a.article`` anchor contributes one record (title/link, date, views,
    likes).

    Args:
        page: Number of pages to fetch. Coerced with ``int()`` because the
            UI number input may deliver a float.
        board_select: Board label understood by ``get_base_url``.

    Returns:
        ``(xlsx_filename, html_table)``. The workbook is written to
        ``{board_select}_{YYYYMMDD}.xlsx`` (a relative path). For an unknown
        board the result is ``("Invalid board selected", "")``.

    Raises:
        requests.RequestException: If a page cannot be fetched (including a
            timeout) after the session's retries.
    """
    import html  # stdlib; imported locally because the file header does not import it

    base_url = get_base_url(board_select)
    if base_url == "Invalid board selected":
        return "Invalid board selected", ""

    # range() rejects floats, so normalise the requested page count up front.
    page_count = int(page)

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    # One clock reading so the file name and the substituted dates agree.
    now = datetime.datetime.now()
    today = now.strftime("%Y%m%d")
    current_date = now.strftime("%Y.%m.%d")
    filename = f'{board_select}_{today}.xlsx'

    column_titles = ('์ œ๋ชฉ', '์ž‘์„ฑ์ผ', '์กฐํšŒ์ˆ˜', '์ข‹์•„์š”')
    html_parts = [
        "<table style='width:100%; border: 1px solid black;'><tr>"
        + "".join(f"<th>{heading}</th>" for heading in column_titles)
        + "</tr>"
    ]

    session = setup_session()
    workbook = xlsxwriter.Workbook(filename)
    try:
        worksheet = workbook.add_worksheet()
        for col, heading in enumerate(column_titles):
            worksheet.write(0, col, heading)

        row = 1  # next free worksheet row; row 0 holds the headings
        for p in range(1, page_count + 1):
            url = f"{base_url}&search.page={p}"
            # Without a timeout a stalled connection would block forever.
            response = session.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(response.text, 'html.parser')
            article_boards = soup.find_all('div', class_='article-board m-tcol-c')
            if len(article_boards) < 2:
                continue

            for row_data in article_boards[1].find_all('tr'):
                parsed = _parse_article_row(row_data, current_date)
                if parsed is None:
                    continue
                full_link, title, date, views, likes = parsed

                worksheet.write_url(row, 0, full_link, string=title)
                worksheet.write(row, 1, date)
                worksheet.write_number(row, 2, views)
                worksheet.write_number(row, 3, likes)

                # Scraped text is untrusted: escape it before embedding in HTML.
                html_parts.append(
                    f"<tr><td><a href='{html.escape(full_link, quote=True)}' target='_blank'>"
                    f"{html.escape(title)}</a></td><td>{html.escape(date)}</td>"
                    f"<td>{views}</td><td>{likes}</td></tr>"
                )
                row += 1
    finally:
        # Release the file handle and the connection pool even on failure.
        workbook.close()
        session.close()

    html_parts.append("</table>")
    return filename, "".join(html_parts)
88
+
89
# Gradio UI: a board selector, a page-count input and a button that runs the
# scraper and shows the resulting file and HTML table.
# NOTE(review): indentation was lost in this copy of the file; which widgets
# sit inside gr.Row() is assumed (input + button) -- confirm against the app.
with gr.Blocks() as demo:
    gr.Markdown("# N์‚ฌ Cafe ํ•ซ๋”œ ๊ฒŒ์‹œํŒ ํฌ๋กค๋ง")
    gr.Markdown("""
    ํŽ˜์ด์ง€๋ฅผ ์ž…๋ ฅํ•˜๋ฉด ๊ฒฐ๊ณผ๋ฅผ ์ถœ๋ ฅํ•ฉ๋‹ˆ๋‹ค.<br>
    ์ตœ๋Œ€ ํŽ˜์ด์ง€์ˆ˜๋Š” 50ํŽ˜์ด์ง€ ์ž…๋‹ˆ๋‹ค.
    ํ™œ์šฉ๋ฒ• ๋ฐ ์ด์ปค๋จธ์Šค ๊ต์œก ๋ฌธ์˜๋Š” *** ์œผ๋กœ ์ฃผ์„ธ์š”.
    """)
    board_select = gr.Radio(["๋ง˜์ด๋ฒ ๋ฒ ", "๋ง˜์Šคํ™€๋ฆญ", "๊ด‘์ฃผ๋ง˜", "์‡ผํ•‘์ง€๋ฆ„์‹ ", "๋ถ€์‚ฐ๋ง˜", "์ง„ํฌ๋ง˜"], label="๊ฒŒ์‹œํŒ์„ ์„ ํƒํ•˜์„ธ์š”")
    with gr.Row():
        # precision=0 makes the component hand an int to the handler, which
        # passes the value to range().
        inp = gr.Number(label="์ˆ˜์ง‘์„ ์›ํ•˜์‹œ๋Š” ํŽ˜์ด์ง€ ์ˆ˜๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”", value=1, precision=0)
        btn = gr.Button("์ˆ˜์ง‘")
    output_file = gr.File(label="์—‘์…€ํŒŒ์ผ๋กœ ๋‹ค์šด๋กœ๋“œ")
    output_html = gr.HTML()
    btn.click(fn=extract_data_to_excel_and_html, inputs=[inp, board_select], outputs=[output_file, output_html])

if __name__ == "__main__":
    demo.launch(share=True)