Article-to-Post / app.py
mroccuper's picture
Update app.py
fee5da3 verified
import html
import os
import re
from urllib.parse import unquote, urljoin

import google.generativeai as genai
import gradio as gr
import requests
from bs4 import BeautifulSoup
def _alt_text_from_url(img_url):
    """Derive a readable alt text from the last path segment of *img_url*."""
    fname = unquote(img_url.split('/')[-1])
    # Drop a leading thumbnail-size prefix such as "220px-".
    fname = re.sub(r'^\d+px-', '', fname)
    # Drop the file extension.
    fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)
    return fname.replace('_', ' ').strip() or "Image"


def _pick_image(container, page_url):
    """Return ``(absolute_url, alt)`` for the first usable ``<img>``.

    An image is skipped when it has no ``src``, or when its ``alt`` is exactly
    "logo"/"icon" and its ``src`` does not contain "upload", "commons" or
    "wikipedia". Returns ``(None, "")`` when nothing qualifies.
    """
    for img in container.find_all('img'):
        src = img.get('src', '')
        alt = img.get('alt', '')
        if not src:
            continue
        looks_like_media = "upload" in src or "commons" in src or "wikipedia" in src
        if not looks_like_media and alt.lower() in ("logo", "icon"):
            continue
        if src.startswith("//"):
            # Protocol-relative URL: force https.
            img_url = "https:" + src
        else:
            # urljoin leaves absolute URLs untouched and resolves every kind
            # of relative reference ("/a.png", "img/a.png", "../a.png").
            img_url = urljoin(page_url, src)
        return img_url, (alt or _alt_text_from_url(img_url))
    return None, ""


def fetch_article_content(url):
    """Download an HTML page and extract its title, body text and lead image.

    Args:
        url: Address of the article to fetch.

    Returns:
        A tuple ``(title, text, (img_url, img_alt))``.  When the request
        fails, the status is not 200, or the response is not HTML, the result
        is ``(None, None, (None, None))``.  When no image is found the last
        element is ``(None, "")``.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        # Best-effort fetch: any failure is reported to the caller as
        # "no content" rather than raised.
        return None, None, (None, None)

    content_type = resp.headers.get('Content-Type', '').lower()
    if resp.status_code != 200 or 'text/html' not in content_type:
        return None, None, (None, None)

    # Without a charset in the header, resp.text may be decoded with a
    # default encoding; hand the raw bytes to the parser so it can detect
    # the encoding from the document itself.
    markup = resp.text if 'charset' in content_type else resp.content
    soup = BeautifulSoup(markup, 'html.parser')

    # Remove non-article elements before extracting anything.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()

    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else ""
    suffix = " - Wikipedia"
    if title.endswith(suffix):
        # Strip only the trailing suffix, not every occurrence.
        title = title[:-len(suffix)].strip()
    title = title or "Untitled"

    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div is not None:
        # Drop citation markers and reference lists so they do not leak
        # into the extracted text.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = (p.get_text().strip() for p in content_div.find_all('p'))
        text_content = "\n\n".join(text for text in paragraphs if text)
        container = content_div
    else:
        text_content = soup.get_text(separator="\n")
        container = soup
    text_content = text_content.strip()

    img_url, img_alt = _pick_image(container, url)
    return title, text_content, (img_url, img_alt)
def _strip_code_fence(text):
    """Return *text* stripped, without a surrounding Markdown code fence."""
    text = (text or "").strip()
    fenced = re.fullmatch(r"```[A-Za-z]*[ \t]*\n(.*?)\n?```", text, flags=re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()
    return text


def generate_post(platform, title, content, model):
    """Ask *model* to rewrite an article as an HTML post for *platform*.

    Args:
        platform: Target platform name; "reddit" and "quora" (any case) have
            dedicated style instructions, anything else uses a generic tone.
        title: Article title, included in the prompt for context.
        content: Article body text to transform.
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.

    Returns:
        The generated HTML body, or an HTML error paragraph when generation
        fails or the response carries no readable text.
    """
    platform = platform.lower()
    style_by_platform = {
        "reddit": (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        ),
        "quora": (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        ),
    }
    style_instructions = style_by_platform.get(platform, "a clear and accessible tone")
    prompt = (
        f"Transform the following article content into {style_instructions}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )
    try:
        # Reading .text stays inside the try: accessing it can itself raise
        # when the response holds no usable text (e.g. a blocked response).
        body = model.generate_content(prompt).text
    except Exception as e:
        return f"<p><em>Error: failed to generate {platform} content ({e})</em></p>"
    # The output is rendered as HTML, so a Markdown fence the model added
    # despite the instructions would show up as literal backticks.
    return _strip_code_fence(body)
def process_url(url, api_key):
    """Fetch an article and build Reddit- and Quora-style HTML posts from it.

    Args:
        url: Address of the article to convert.
        api_key: Gemini API key used to configure the generative model.

    Returns:
        A pair ``(reddit_html, quora_html)``.  On any failure (missing key,
        API configuration error, unreachable article) both elements hold the
        same HTML error message.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg
    url = (url or "").strip()

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {html.escape(str(e))}</em></p>"
        return error_msg, error_msg

    # The URL is user input and title/image data come from the fetched page;
    # escape all of it before placing it into markup or attribute values.
    safe_url = html.escape(url, quote=True)

    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = f"<p><em>Could not retrieve content from this URL: <a href='{safe_url}'>{safe_url}</a></em></p>"
        return error_msg, error_msg

    reddit_body = generate_post("reddit", title, content, model)
    quora_body = generate_post("quora", title, content, model)

    # Shared frame around both bodies: heading, optional image, source link.
    header_html = f"<h2>{html.escape(title or '')}</h2>\n"
    if img_url:
        safe_img_url = html.escape(img_url, quote=True)
        safe_img_alt = html.escape(img_alt or "", quote=True)
        header_html += f'<img src="{safe_img_url}" alt="{safe_img_alt}" style="max-width:100%; height:auto;" />\n'
    source_html = f'<p><small><em>Source: <a href="{safe_url}" target="_blank">{safe_url}</a></em></small></p>'

    reddit_html = header_html + reddit_body + source_html
    quora_html = header_html + quora_body + source_html
    return reddit_html, quora_html
# Gradio UI: two text inputs (article URL, API key) feed process_url, whose
# two HTML results are shown side by side as the Reddit and Quora posts.
article_url_box = gr.Textbox(
    label="Article URL",
    placeholder="https://en.wikipedia.org/wiki/Kefir",
)
api_key_box = gr.Textbox(
    label="Gemini API Key",
    placeholder="Paste your Gemini API key here",
    type="password",
)
reddit_panel = gr.HTML(label="Reddit-formatted Post")
quora_panel = gr.HTML(label="Quora-formatted Post")

demo = gr.Interface(
    fn=process_url,
    inputs=[article_url_box, api_key_box],
    outputs=[reddit_panel, quora_panel],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML.",
)

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()