"""Gradio app that turns an article URL into Reddit- and Quora-style HTML posts.

The article is downloaded with ``requests``, reduced to a title, body text and
one representative image with BeautifulSoup, and rewritten by a Gemini model.
"""

import html
import os
import re
from urllib.parse import unquote, urljoin

import google.generativeai as genai
import gradio as gr
import requests
from bs4 import BeautifulSoup

GEMINI_MODEL_NAME = 'gemini-1.5-pro-latest'
REQUEST_TIMEOUT_SECONDS = 10

_WIKIPEDIA_TITLE_SUFFIX = " - Wikipedia"
# Tags whose content is page chrome rather than article text.
_BOILERPLATE_TAGS = ['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']
# Substrings of an image URL that mark it as an acceptable article image
# regardless of its alt text.
_PREFERRED_IMAGE_HINTS = ("upload", "commons", "wikipedia")
# Alt texts (compared lower-cased) of images that are skipped.
_IGNORED_IMAGE_ALTS = {"logo", "icon"}
# Matches a Markdown code fence opening at the start or closing at the end.
_CODE_FENCE_RE = re.compile(r"^\s*```[A-Za-z]*\s*\n|\n?\s*```\s*$")

_STYLE_INSTRUCTIONS = {
    "reddit": (
        "an informal, conversational tone, as if posting on Reddit. "
        "Format the response using HTML tags for paragraphs and lists, "
        "but do not wrap it in triple backticks or ```html code blocks."
    ),
    "quora": (
        "a clear, detailed explanatory tone, as if answering on Quora. "
        "Use proper HTML for readability, without wrapping in code blocks."
    ),
}
_DEFAULT_STYLE_INSTRUCTIONS = "a clear and accessible tone"


def _error_html(message):
    """Return *message* as an HTML paragraph with special characters escaped."""
    return f"<p>{html.escape(str(message))}</p>"


def _extract_text(soup, content_div):
    """Return the article text, paragraphs separated by blank lines."""
    if not content_div:
        return soup.get_text(separator="\n").strip()

    # Drop footnote markers and reference lists so they do not pollute the text.
    for ref in content_div.find_all('sup', {'class': 'reference'}):
        ref.decompose()
    for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
        ref_list.decompose()

    paragraphs = (p.get_text().strip() for p in content_div.find_all('p'))
    text = "\n\n".join(p for p in paragraphs if p)
    if not text:
        # No <p> elements with text: fall back to everything in the container.
        text = content_div.get_text(separator="\n")
    return text.strip()


def _select_image(images):
    """Return ``(src, alt)`` of the first usable image, or ``(None, "")``."""
    for img in images:
        src = img.get('src', '')
        if not src:
            continue
        alt = img.get('alt', '') or ""
        preferred = any(hint in src for hint in _PREFERRED_IMAGE_HINTS)
        if preferred or alt.lower() not in _IGNORED_IMAGE_ALTS:
            return src, alt
    return None, ""


def _alt_from_url(img_url):
    """Derive a readable alt text from the file name at the end of *img_url*."""
    fname = unquote(img_url.split('/')[-1])
    fname = re.sub(r'^\d+px-', '', fname)           # leading "<digits>px-" prefix
    fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)   # file extension
    return fname.replace('_', ' ').strip() or "Image"


def fetch_article_content(url):
    """Download *url* and extract its title, body text and a lead image.

    Args:
        url: Address of the article page.

    Returns:
        A tuple ``(title, text, (img_url, img_alt))``. When the request fails,
        the status is not 200, or the response is not HTML, the result is
        ``(None, None, (None, None))``. ``img_url`` is ``None`` and ``img_alt``
        is ``""`` when no usable image is found.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException:
        return None, None, (None, None)

    content_type = resp.headers.get('Content-Type', '').lower()
    if resp.status_code != 200 or 'text/html' not in content_type:
        return None, None, (None, None)

    soup = BeautifulSoup(resp.text, 'html.parser')
    for tag in soup(_BOILERPLATE_TAGS):
        tag.decompose()

    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    if title.endswith(_WIKIPEDIA_TITLE_SUFFIX):
        # Strip only the trailing suffix, not occurrences inside the title.
        title = title[:-len(_WIKIPEDIA_TITLE_SUFFIX)]

    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    text_content = _extract_text(soup, content_div)

    images = content_div.find_all('img') if content_div else soup.find_all('img')
    img_url, img_alt = _select_image(images)
    if img_url:
        if img_url.startswith("//"):
            img_url = "https:" + img_url
        else:
            # Resolves root-relative and path-relative sources; absolute URLs
            # pass through unchanged.
            img_url = urljoin(url, img_url)
        if not img_alt:
            img_alt = _alt_from_url(img_url)

    return title, text_content, (img_url, img_alt)


def generate_post(platform, title, content, model):
    """Ask *model* to rewrite the article as an HTML post body for *platform*.

    Args:
        platform: Target platform name; "reddit" and "quora" (any case) select
            a dedicated tone, anything else a generic one.
        title: Article title, included in the prompt for context.
        content: Article text to transform.
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.

    Returns:
        The generated HTML body, or an HTML error paragraph if generation or
        reading the response text fails.
    """
    platform = platform.lower()
    style_instructions = _STYLE_INSTRUCTIONS.get(platform, _DEFAULT_STYLE_INSTRUCTIONS)
    prompt = (
        f"Transform the following article content into {style_instructions}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )
    try:
        # Reading ``.text`` stays inside the try so that a failure there is
        # reported the same way as a failed request.
        generated = model.generate_content(prompt).text
    except Exception as e:
        return _error_html(f"Error: failed to generate {platform} content ({e})")
    # Remove a wrapping code fence in case the model added one anyway.
    return _CODE_FENCE_RE.sub("", generated).strip()


def process_url(url, api_key):
    """Build Reddit- and Quora-style HTML posts for the article at *url*.

    Args:
        url: Address of the article to convert.
        api_key: Gemini API key used to configure the client.

    Returns:
        A tuple ``(reddit_html, quora_html)``. On failure both elements hold
        the same HTML error paragraph.
    """
    if not api_key:
        error_msg = _error_html("API key is required.")
        return error_msg, error_msg

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(GEMINI_MODEL_NAME)
    except Exception as e:
        error_msg = _error_html(f"Failed to configure Gemini API: {e}")
        return error_msg, error_msg

    url = (url or "").strip()
    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = _error_html(f"Could not retrieve content from this URL: {url}")
        return error_msg, error_msg

    reddit_body = generate_post("reddit", title, content, model)
    quora_body = generate_post("quora", title, content, model)

    # Everything scraped from the page or typed by the user is escaped before
    # being placed into markup; the model output is inserted as HTML as-is.
    safe_url = html.escape(url, quote=True)
    header_html = f"<h2>{html.escape(title)}</h2>\n"
    if img_url:
        header_html += (
            f'<img src="{html.escape(img_url, quote=True)}" '
            f'alt="{html.escape(img_alt, quote=True)}">\n'
        )
    source_html = f'<p>Source: <a href="{safe_url}">{safe_url}</a></p>'

    reddit_html = header_html + reddit_body + source_html
    quora_html = header_html + quora_body + source_html
    return reddit_html, quora_html


# Gradio interface
demo = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Article URL", placeholder="https://en.wikipedia.org/wiki/Kefir"),
        gr.Textbox(label="Gemini API Key", placeholder="Paste your Gemini API key here", type="password"),
    ],
    outputs=[
        gr.HTML(label="Reddit-formatted Post"),
        gr.HTML(label="Quora-formatted Post"),
    ],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML.",
)

if __name__ == "__main__":
    demo.launch()