# Hugging Face Space: turn an article URL into Reddit- and Quora-styled HTML posts.
import html
import os
import re
from urllib.parse import unquote, urljoin

import google.generativeai as genai
import gradio as gr
import requests
from bs4 import BeautifulSoup
def fetch_article_content(url):
    """Download *url* and extract its title, body text, and a lead image.

    Returns:
        (title, text_content, (img_url, img_alt)).
        On any fetch failure, non-200 status, or non-HTML response:
        (None, None, (None, None)).
    """
    headers = {
        # Browser-like UA: some sites reject the default python-requests agent.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        return None, None, (None, None)
    if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type', ''):
        return None, None, (None, None)

    soup = BeautifulSoup(resp.text, 'html.parser')
    # Drop page chrome before any text extraction.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()

    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    # Strip only the trailing site suffix (replace() would also mangle a
    # legitimate mid-title occurrence).
    if title.endswith(" - Wikipedia"):
        title = title[: -len(" - Wikipedia")]

    # Wikipedia keeps the article body in div.mw-parser-output; other sites
    # fall back to <body>.
    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div:
        # Remove citation superscripts and reference lists so they don't
        # pollute the paragraph text.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = content_div.find_all('p')
        text_content = "\n\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
    else:
        text_content = soup.get_text(separator="\n")
    text_content = text_content.strip()

    imgs = content_div.find_all('img') if content_div else soup.find_all('img')
    img_url, img_alt = _select_lead_image(imgs)
    if img_url:
        if img_url.startswith("//"):
            img_url = "https:" + img_url  # protocol-relative URL
        elif img_url.startswith("/"):
            img_url = urljoin(url, img_url)  # site-relative URL
        if not img_alt:
            img_alt = _alt_from_filename(img_url)
    return title, text_content, (img_url, img_alt)


def _select_lead_image(imgs):
    """Pick a lead image from *imgs* (bs4 <img> tags).

    Prefers Wikipedia/Commons-hosted images anywhere in the list; otherwise
    falls back to the first image whose alt text is not just "logo"/"icon".
    (The original code broke out of the loop on the first image regardless,
    so the wiki preference almost never applied.)

    Returns (src, alt) or (None, "").
    """
    fallback = None
    for img in imgs:
        src = img.get('src', '')
        alt = img.get('alt', '') or ""
        if not src:
            continue
        if "upload" in src or "commons" in src or "wikipedia" in src:
            return src, alt
        if fallback is None and alt.lower() not in ("logo", "icon"):
            fallback = (src, alt)
    return fallback if fallback else (None, "")


def _alt_from_filename(img_url):
    """Derive human-readable alt text from the image URL's filename."""
    fname = unquote(img_url.split('/')[-1])
    fname = re.sub(r'^\d+px-', '', fname)          # Wikipedia thumbnail size prefix
    fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)  # file extension
    alt = fname.replace('_', ' ').strip()
    return alt or "Image"
def generate_post(platform, title, content, model):
    """Rewrite an article as an HTML post styled for the given platform.

    Args:
        platform: target site name (case-insensitive); "reddit" and "quora"
            get tailored tone instructions, anything else a neutral one.
        title: article title, included in the prompt for context only.
        content: article body text to transform.
        model: object exposing ``generate_content(prompt)`` returning an
            object with a ``.text`` attribute (e.g. a Gemini model).

    Returns:
        The generated HTML body, or an HTML error snippet on failure.
    """
    site = platform.lower()
    tone_by_site = {
        "reddit": (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        ),
        "quora": (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        ),
    }
    style_instructions = tone_by_site.get(site, "a clear and accessible tone")

    prompt = (
        f"Transform the following article content into {style_instructions}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )

    try:
        result = model.generate_content(prompt)
    except Exception as e:
        return f"<p><em>Error: failed to generate {site} content ({e})</em></p>"
    return result.text.strip()
def process_url(url, api_key):
    """Fetch an article and produce Reddit- and Quora-styled HTML posts.

    Args:
        url: article URL to fetch.
        api_key: Gemini API key; required.

    Returns:
        (reddit_html, quora_html). On any failure both elements are the same
        HTML error snippet.
    """
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {e}</em></p>"
        return error_msg, error_msg

    title, content, (img_url, img_alt) = fetch_article_content(url)
    # Escape everything interpolated into HTML: page-derived title/alt (and
    # the user-supplied URL) may contain quotes or angle brackets that would
    # break the markup or inject script.
    safe_url = html.escape(url, quote=True)
    if not content:
        error_msg = (
            f"<p><em>Could not retrieve content from this URL: "
            f"<a href='{safe_url}'>{safe_url}</a></em></p>"
        )
        return error_msg, error_msg
    safe_title = html.escape(title)

    reddit_body = generate_post("reddit", title, content, model)
    quora_body = generate_post("quora", title, content, model)

    source_html = f'<p><small><em>Source: <a href="{safe_url}" target="_blank">{safe_url}</a></em></small></p>'
    reddit_html = f"<h2>{safe_title}</h2>\n"
    quora_html = f"<h2>{safe_title}</h2>\n"
    if img_url:
        img_tag = (
            f'<img src="{html.escape(img_url, quote=True)}" '
            f'alt="{html.escape(img_alt, quote=True)}" '
            f'style="max-width:100%; height:auto;" />\n'
        )
        reddit_html += img_tag
        quora_html += img_tag
    reddit_html += reddit_body + source_html
    quora_html += quora_body + source_html
    return reddit_html, quora_html
# --- Gradio interface wiring ---
_url_input = gr.Textbox(
    label="Article URL",
    placeholder="https://en.wikipedia.org/wiki/Kefir",
)
_key_input = gr.Textbox(
    label="Gemini API Key",
    placeholder="Paste your Gemini API key here",
    type="password",
)

demo = gr.Interface(
    fn=process_url,
    inputs=[_url_input, _key_input],
    outputs=[
        gr.HTML(label="Reddit-formatted Post"),
        gr.HTML(label="Quora-formatted Post"),
    ],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML.",
)

if __name__ == "__main__":
    demo.launch()