import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
import google.generativeai as genai
import gradio as gr
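
# Hugging Face Space: fetch an article from a URL and use the Gemini API to
# rewrite it as Reddit- and Quora-style posts rendered as HTML.
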
def fetch_article_content(url):
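    """Download an article and extract its title, body text, and lead image.

    Returns (title, text, (image_url, image_alt)); on any failure returns
    (None, None, (None, None)). Tuned for Wikipedia pages, with a generic
    fallback for other sites.
    """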
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        return None, None, (None, None)
    # Bail out on non-200 responses and non-HTML content.
    if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type', ''):
        return None, None, (None, None)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Strip page chrome and non-content elements before extracting text.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()
    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    if title.endswith(" - Wikipedia"):
        title = title.replace(" - Wikipedia", "")
    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div:
        # Remove Wikipedia citation markers and reference lists.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = content_div.find_all('p')
        text_content = "\n\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
    else:
        text_content = soup.get_text(separator="\n")
    text_content = text_content.strip()
    # Pick the first image that is hosted on Wikipedia/Commons, or whose
    # alt text is not just "logo"/"icon".
    img_url, img_alt = None, ""
    imgs = content_div.find_all('img') if content_div else soup.find_all('img')
    for img in imgs:
        src = img.get('src', '')
        alt = img.get('alt', '')
        if not src:
            continue
        if "upload" in src or "commons" in src or "wikipedia" in src:
            img_url, img_alt = src, alt
            break
        if alt.lower() not in ["logo", "icon"]:
            img_url, img_alt = src, alt
            break
    if img_url:
        # Resolve protocol-relative and site-relative image URLs.
        if img_url.startswith("//"):
            img_url = "https:" + img_url
        elif img_url.startswith("/"):
            img_url = urljoin(url, img_url)
        if not img_alt:
            # Derive alt text from the file name, stripping any thumbnail
            # size prefix (e.g. "220px-") and the file extension.
            fname = unquote(img_url.split('/')[-1])
            fname = re.sub(r'^\d+px-', '', fname)
            fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)
            img_alt = fname.replace('_', ' ').strip()
        if not img_alt:
            img_alt = "Image"
    return title, text_content, (img_url, img_alt)

def generate_post(platform, title, content, model):
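    """Ask the Gemini model to rewrite the article body as HTML in the
    given platform's style ("reddit" or "quora")."""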
    platform = platform.lower()
    if platform == "reddit":
        style_instructions = (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        )
    elif platform == "quora":
        style_instructions = (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        )
    else:
        style_instructions = "a clear and accessible tone"
    prompt = (
        f"Transform the following article content into {style_instructions}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )
    try:
        response = model.generate_content(prompt)
    except Exception as e:
        return f"<p><em>Error: failed to generate {platform} content ({e})</em></p>"
    return response.text.strip()

def process_url(url, api_key):
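    """Gradio handler: fetch the article at `url` and return a pair of
    HTML strings (a Reddit-style post and a Quora-style post)."""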
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {e}</em></p>"
        return error_msg, error_msg
    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = f"<p><em>Could not retrieve content from this URL: <a href='{url}'>{url}</a></em></p>"
        return error_msg, error_msg
    reddit_body = generate_post("reddit", title, content, model)
    quora_body = generate_post("quora", title, content, model)
    # Assemble each post: title, optional lead image, body, source link.
    source_html = f'<p><small><em>Source: <a href="{url}" target="_blank">{url}</a></em></small></p>'
    reddit_html = f"<h2>{title}</h2>\n"
    quora_html = f"<h2>{title}</h2>\n"
    if img_url:
        img_tag = f'<img src="{img_url}" alt="{img_alt}" style="max-width:100%; height:auto;" />\n'
        reddit_html += img_tag
        quora_html += img_tag
    reddit_html += reddit_body + source_html
    quora_html += quora_body + source_html
    return reddit_html, quora_html

# Gradio interface
demo = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Article URL", placeholder="https://en.wikipedia.org/wiki/Kefir"),
        gr.Textbox(label="Gemini API Key", placeholder="Paste your Gemini API key here", type="password")
    ],
    outputs=[
        gr.HTML(label="Reddit-formatted Post"),
        gr.HTML(label="Quora-formatted Post")
    ],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML."
)
if __name__ == "__main__":
    demo.launch()
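
# Quick smoke test without the UI (hypothetical usage; no API key needed):
#   title, text, (img, alt) = fetch_article_content("https://en.wikipedia.org/wiki/Kefir")
#   print(title, img)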