Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,98 +5,6 @@ import re
|
|
5 |
from huggingface_hub import InferenceClient
|
6 |
from fpdf import FPDF
|
7 |
from datetime import datetime
|
8 |
-
import requests
|
9 |
-
from bs4 import BeautifulSoup
|
10 |
-
from requests.adapters import HTTPAdapter
|
11 |
-
from requests.packages.urllib3.util.retry import Retry
|
12 |
-
import time
|
13 |
-
|
14 |
-
def setup_session():
    """Create a requests Session whose HTTPS traffic retries on gateway errors.

    Returns:
        requests.Session configured with a 5-attempt retry policy
        (backoff factor 1, retrying on 502/503/504), or None if the
        session could not be constructed — callers treat None as
        "no network available".
    """
    try:
        retry_policy = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
        http_session = requests.Session()
        http_session.mount('https://', HTTPAdapter(max_retries=retry_policy))
    except Exception:
        return None
    return http_session
|
22 |
-
|
23 |
-
def generate_naver_search_url(query):
    """Build a Naver blog-tab search URL for *query*.

    Args:
        query: Search term (typically Korean text).

    Returns:
        Fully-qualified search URL string.

    Fix: the original joined key=value pairs with no percent-encoding,
    so spaces, '&', '#' or other reserved characters in the query
    produced a malformed URL. urlencode() escapes them properly.
    """
    from urllib.parse import urlencode  # local import keeps the block self-contained

    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    return base_url + urlencode(params)
|
28 |
-
|
29 |
-
def crawl_blog_content(url, session):
    """Fetch one Naver blog post and return its cleaned body text.

    Args:
        url: Mobile blog post URL.
        session: requests.Session to issue the GET with.

    Returns:
        Whitespace-normalized post text, or "" on any failure
        (non-200 status, missing content container, or exception).
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://search.naver.com/search.naver",
    }
    try:
        # Randomized pause so successive requests don't look automated.
        time.sleep(random.uniform(1, 2))

        response = session.get(url, headers=request_headers)
        if response.status_code != 200:
            return ""

        soup = BeautifulSoup(response.content, "html.parser")
        # SmartEditor posts put the article body in 'se-main-container'.
        main_container = soup.find("div", attrs={'class': 'se-main-container'})
        if main_container:
            return clean_text(main_container.get_text())
        return ""
    except Exception:
        return ""
|
56 |
-
|
57 |
-
def crawl_naver_search_results(url, session):
    """Scrape a Naver blog-search result page for up to 10 post links.

    Args:
        url: Naver search URL (see generate_naver_search_url).
        session: requests.Session to issue the GET with.

    Returns:
        List of up to 10 dicts with keys "์ ๋ชฉ" (title) and "๋งํฌ" (link);
        blog.naver links are rewritten to their mobile (m.) form.
        Returns [] on any failure.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Referer": "https://search.naver.com/search.naver",
    }
    try:
        response = session.get(url, headers=request_headers)
        if response.status_code != 200:
            return []

        soup = BeautifulSoup(response.content, "html.parser")
        results = []
        # Result items live under li.bx* > div.detail_box > div.title_area.
        for item in soup.find_all("li", class_=re.compile("bx.*")):
            for detail in item.find_all("div", class_="detail_box"):
                for title_area in detail.find_all("div", class_="title_area"):
                    title = title_area.text.strip()
                    for anchor in title_area.find_all("a", href=True):
                        link = anchor["href"]
                        if "blog.naver" in link:
                            # Mobile pages are easier to parse downstream.
                            link = link.replace("https://", "https://m.")
                        results.append({"์ ๋ชฉ": title, "๋งํฌ": link})
                        if len(results) >= 10:
                            return results
        return results
    except Exception:
        return []
|
96 |
-
|
97 |
-
def clean_text(text):
    """Collapse every run of whitespace into a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
|
100 |
|
101 |
def create_client(model_name):
    """Return a Hugging Face InferenceClient for *model_name*.

    Authenticates with the HF_TOKEN environment variable (None if unset).
    """
    hf_token = os.getenv("HF_TOKEN")
    return InferenceClient(model_name, token=hf_token)
|
@@ -133,22 +41,6 @@ def generate_blog_post(category, style, topic, references1, references2, referen
|
|
133 |
formatted_text = modified_text.replace('\n', '\n\n')
|
134 |
return formatted_text
|
135 |
|
136 |
-
def fetch_references(topic):
    """Crawl Naver blog search for *topic* and return three reference texts.

    Args:
        topic: Search keyword entered by the user.

    Returns:
        4-tuple (status_message, ref1, ref2, ref3). On failure the status
        explains why and the three reference strings are empty.

    Fix: the original called random.sample(results, 3) unconditionally,
    which raises ValueError whenever the crawl yields only 1 or 2 results.
    We now sample at most len(results) posts and pad with empty strings so
    the caller always receives exactly three reference slots.
    """
    search_url = generate_naver_search_url(topic)
    session = setup_session()
    if session is None:
        return "Failed to set up session.", "", "", ""
    results = crawl_naver_search_results(search_url, session)
    if not results:
        return "No results found.", "", "", ""

    sample_size = min(3, len(results))
    picked = random.sample(results, sample_size)
    contents = [
        f"์ ๋ชฉ: {post['์ ๋ชฉ']}\n๋ด์ฉ: {crawl_blog_content(post['๋งํฌ'], session)}"
        for post in picked
    ]
    # Pad so the UI's three reference textboxes are always filled.
    while len(contents) < 3:
        contents.append("")

    return "์ฐธ๊ณ ๊ธ ์์ฑ ์๋ฃ", contents[0], contents[1], contents[2]
|
151 |
-
|
152 |
def get_title_prompt(category):
|
153 |
if (category == "๊ณ ๊ฐ๋ฐ์ํ"):
|
154 |
return """
|
@@ -399,7 +291,7 @@ with gr.Blocks() as demo:
|
|
399 |
gr.Markdown(f"# {title}")
|
400 |
|
401 |
gr.Markdown("### 1๋จ๊ณ: ํฌ์คํ
์นดํ
๊ณ ๋ฆฌ๋ฅผ ์ง์ ํด์ฃผ์ธ์", elem_id="step-title")
|
402 |
-
category = gr.Radio(choices=["๊ณ ๊ฐ๋ฐ์ํ"], label="ํฌ์คํ
์นดํ
๊ณ ๋ฆฌ", value="๊ณ ๊ฐ๋ฐ์ํ")
|
403 |
|
404 |
gr.Markdown("---\n\n")
|
405 |
|
@@ -419,17 +311,6 @@ with gr.Blocks() as demo:
|
|
419 |
references2 = gr.Textbox(label="์ฐธ๊ณ ๊ธ 2", placeholder="์ฐธ๊ณ ํ ๊ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ผ์ธ์", lines=10, visible=True)
|
420 |
references3 = gr.Textbox(label="์ฐธ๊ณ ๊ธ 3", placeholder="์ฐธ๊ณ ํ ๊ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ผ์ธ์", lines=10, visible=True)
|
421 |
|
422 |
-
# ์งํ ์ํฉ ํ์๋ฅผ ์ํ ์ถ๋ ฅ ํ
์คํธ๋ฐ์ค
|
423 |
-
progress_output = gr.Textbox(label="์งํ ์ํฉ", lines=2, visible=True)
|
424 |
-
|
425 |
-
# ์ฐธ๊ณ ๊ธ ๊ฐ์ ธ์ค๊ธฐ ๋ฒํผ
|
426 |
-
fetch_references_btn = gr.Button("์ฐธ๊ณ ๊ธ ์์ฑํ๊ธฐ")
|
427 |
-
fetch_references_btn.click(fn=fetch_references, inputs=[topic], outputs=[progress_output, references1, references2, references3])
|
428 |
-
|
429 |
-
# ์ฐธ๊ณ ๊ธ ๋ค์ ๋ฃ๊ธฐ ๋ฒํผ
|
430 |
-
refill_btn = gr.Button("์ฐธ๊ณ ๊ธ ๋ค์ ๋ฃ๊ธฐ")
|
431 |
-
refill_btn.click(fn=fetch_references, inputs=[topic], outputs=[progress_output, references1, references2, references3])
|
432 |
-
|
433 |
gr.Markdown("---\n\n")
|
434 |
|
435 |
gr.Markdown("### 5๋จ๊ณ: ๋ธ๋ก๊ทธ ์ ๋ชฉ์ ์
๋ ฅํ์ธ์", elem_id="step-title")
|
|
|
5 |
from huggingface_hub import InferenceClient
|
6 |
from fpdf import FPDF
|
7 |
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def create_client(model_name):
|
10 |
return InferenceClient(model_name, token=os.getenv("HF_TOKEN"))
|
|
|
41 |
formatted_text = modified_text.replace('\n', '\n\n')
|
42 |
return formatted_text
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
def get_title_prompt(category):
|
45 |
if (category == "๊ณ ๊ฐ๋ฐ์ํ"):
|
46 |
return """
|
|
|
291 |
gr.Markdown(f"# {title}")
|
292 |
|
293 |
gr.Markdown("### 1๋จ๊ณ: ํฌ์คํ
์นดํ
๊ณ ๋ฆฌ๋ฅผ ์ง์ ํด์ฃผ์ธ์", elem_id="step-title")
|
294 |
+
category = gr.Radio(choices=["๊ณ ๊ฐ๋ฐ์ํ","๊ธฐ๋ฅ์ง์คํ(1๊ฐ ๊ธฐ๋ฅ)"], label="ํฌ์คํ
์นดํ
๊ณ ๋ฆฌ", value="๊ณ ๊ฐ๋ฐ์ํ")
|
295 |
|
296 |
gr.Markdown("---\n\n")
|
297 |
|
|
|
311 |
references2 = gr.Textbox(label="์ฐธ๊ณ ๊ธ 2", placeholder="์ฐธ๊ณ ํ ๊ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ผ์ธ์", lines=10, visible=True)
|
312 |
references3 = gr.Textbox(label="์ฐธ๊ณ ๊ธ 3", placeholder="์ฐธ๊ณ ํ ๊ธ์ ๋ณต์ฌํ์ฌ ๋ถ์ฌ๋ฃ์ผ์ธ์", lines=10, visible=True)
|
313 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
314 |
gr.Markdown("---\n\n")
|
315 |
|
316 |
gr.Markdown("### 5๋จ๊ณ: ๋ธ๋ก๊ทธ ์ ๋ชฉ์ ์
๋ ฅํ์ธ์", elem_id="step-title")
|