import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
import google.generativeai as genai
import gradio as gr
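
# Hugging Face Space: fetch an article from a URL and use the Gemini API to
# rewrite it as Reddit- and Quora-style posts rendered as HTML.
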
def fetch_article_content(url):
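    """Download an article and extract its title, body text, and lead image.

    Returns (title, text, (image_url, image_alt)); on any failure returns
    (None, None, (None, None)). Tuned for Wikipedia pages, with a generic
    fallback for other sites.
    """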
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        resp = requests.get(url, headers=headers, timeout=10)
    except Exception:
        return None, None, (None, None)
    # Bail out on non-200 responses and non-HTML content.
    if resp.status_code != 200 or 'text/html' not in resp.headers.get('Content-Type', ''):
        return None, None, (None, None)
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Strip page chrome and non-content elements before extracting text.
    for tag in soup(['script', 'style', 'header', 'footer', 'noscript', 'form', 'nav', 'aside']):
        tag.decompose()
    title_tag = soup.find('h1') or soup.title
    title = title_tag.get_text().strip() if title_tag else "Untitled"
    if title.endswith(" - Wikipedia"):
        title = title.replace(" - Wikipedia", "")
    content_div = soup.find('div', {'class': 'mw-parser-output'}) or soup.body
    if content_div:
        # Remove Wikipedia citation markers and reference lists.
        for ref in content_div.find_all('sup', {'class': 'reference'}):
            ref.decompose()
        for ref_list in content_div.find_all(['ol', 'ul'], {'class': 'references'}):
            ref_list.decompose()
        paragraphs = content_div.find_all('p')
        text_content = "\n\n".join(p.get_text().strip() for p in paragraphs if p.get_text().strip())
    else:
        text_content = soup.get_text(separator="\n")
    text_content = text_content.strip()
    # Pick the first image that is hosted on Wikipedia/Commons, or whose
    # alt text is not just "logo"/"icon".
    img_url, img_alt = None, ""
    imgs = content_div.find_all('img') if content_div else soup.find_all('img')
    for img in imgs:
        src = img.get('src', '')
        alt = img.get('alt', '')
        if not src:
            continue
        if "upload" in src or "commons" in src or "wikipedia" in src:
            img_url, img_alt = src, alt
            break
        if alt.lower() not in ["logo", "icon"]:
            img_url, img_alt = src, alt
            break
    if img_url:
        # Resolve protocol-relative and site-relative image URLs.
        if img_url.startswith("//"):
            img_url = "https:" + img_url
        elif img_url.startswith("/"):
            img_url = urljoin(url, img_url)
        if not img_alt:
            # Derive alt text from the file name, stripping any thumbnail
            # size prefix (e.g. "220px-") and the file extension.
            fname = unquote(img_url.split('/')[-1])
            fname = re.sub(r'^\d+px-', '', fname)
            fname = re.sub(r'\.[A-Za-z0-9]+$', '', fname)
            img_alt = fname.replace('_', ' ').strip()
        if not img_alt:
            img_alt = "Image"
    return title, text_content, (img_url, img_alt)

def generate_post(platform, title, content, model):
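    """Ask the Gemini model to rewrite the article body as HTML in the
    given platform's style ("reddit" or "quora")."""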
    platform = platform.lower()
    if platform == "reddit":
        style_instructions = (
            "an informal, conversational tone, as if posting on Reddit. "
            "Format the response using HTML tags for paragraphs and lists, "
            "but do not wrap it in triple backticks or ```html code blocks."
        )
    elif platform == "quora":
        style_instructions = (
            "a clear, detailed explanatory tone, as if answering on Quora. "
            "Use proper HTML for readability, without wrapping in code blocks."
        )
    else:
        style_instructions = "a clear and accessible tone"
    prompt = (
        f"Transform the following article content into {style_instructions}.\n"
        f"Output the result in valid HTML format with proper paragraphs (and lists if needed).\n"
        f"Do NOT include the title or image — only the body content in HTML.\n\n"
        f"Article Title: {title}\n"
        f"Article Content:\n\"\"\"\n{content}\n\"\"\""
    )
    try:
        response = model.generate_content(prompt)
    except Exception as e:
        return f"<p><em>Error: failed to generate {platform} content ({e})</em></p>"
    return response.text.strip()

def process_url(url, api_key):
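    """Gradio handler: fetch the article at `url` and return a pair of
    HTML strings (a Reddit-style post and a Quora-style post)."""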
    if not api_key:
        error_msg = "<p><em>API key is required.</em></p>"
        return error_msg, error_msg
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro-latest')
    except Exception as e:
        error_msg = f"<p><em>Failed to configure Gemini API: {e}</em></p>"
        return error_msg, error_msg
    title, content, (img_url, img_alt) = fetch_article_content(url)
    if not content:
        error_msg = f"<p><em>Could not retrieve content from this URL: <a href='{url}'>{url}</a></em></p>"
        return error_msg, error_msg
    reddit_body = generate_post("reddit", title, content, model)
    quora_body = generate_post("quora", title, content, model)
    # Assemble each post: title, optional lead image, body, source link.
    source_html = f'<p><small><em>Source: <a href="{url}" target="_blank">{url}</a></em></small></p>'
    reddit_html = f"<h2>{title}</h2>\n"
    quora_html = f"<h2>{title}</h2>\n"
    if img_url:
        img_tag = f'<img src="{img_url}" alt="{img_alt}" style="max-width:100%; height:auto;" />\n'
        reddit_html += img_tag
        quora_html += img_tag
    reddit_html += reddit_body + source_html
    quora_html += quora_body + source_html
    return reddit_html, quora_html

# Gradio interface
demo = gr.Interface(
    fn=process_url,
    inputs=[
        gr.Textbox(label="Article URL", placeholder="https://en.wikipedia.org/wiki/Kefir"),
        gr.Textbox(label="Gemini API Key", placeholder="Paste your Gemini API key here", type="password")
    ],
    outputs=[
        gr.HTML(label="Reddit-formatted Post"),
        gr.HTML(label="Quora-formatted Post")
    ],
    title="Article → Reddit & Quora Post Generator",
    description="Enter an article link and your Gemini API key to generate Reddit- and Quora-style posts in HTML."
)
if __name__ == "__main__":
    demo.launch()
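
# Quick smoke test without the UI (hypothetical usage; no API key needed):
#   title, text, (img, alt) = fetch_article_content("https://en.wikipedia.org/wiki/Kefir")
#   print(title, img)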