Spaces:

MicroHealth
/

website-to-pdf

Sleeping

App Files Files Community

bluenevus committed on Apr 25

Commit

459429d

verified ·

1 Parent(s): e82072d

Update app.py

Browse files

Files changed (1) hide show

app.py +281 -177

app.py CHANGED Viewed

@@ -2,197 +2,301 @@ import dash
 from dash import dcc, html, Input, Output, State
 import dash_bootstrap_components as dbc
 from dash.exceptions import PreventUpdate
-import base64
 import requests
-from bs4 import BeautifulSoup
-from urllib.parse import urljoin, urlparse
-from fpdf import FPDF
-import re
-import logging
-import asyncio
-import aiohttp
-from aiolimiter import AsyncLimiter
-import sqlite3
-from contextlib import contextmanager
-from threading import local
-import time
-import os
-import ssl
-from io import BytesIO
 import tempfile
-import uuid
-from concurrent.futures import ThreadPoolExecutor
-from PyPDF2 import PdfMerger
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
# Flask server object that backs the Dash app
server = app.server
# Logging setup: INFO level on the root logger, plus a module-level logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Thread-local storage for database connections (one SQLite connection per thread)
thread_local = local()
# Rate limiter: 10 requests per second
rate_limiter = AsyncLimiter(10, 1)
# Create an SSL context that ignores certificate verification
# NOTE(review): hostname checking and certificate validation are both switched
# off here, so HTTPS responses fetched with this context are not authenticated.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
# ThreadPoolExecutor for background tasks (at most 4 worker threads)
executor = ThreadPoolExecutor(max_workers=4)
@contextmanager
def get_db_connection():
    """Yield the calling thread's SQLite connection to ``crawl_cache.db``.

    The connection is opened the first time a thread asks for it, cached on
    ``thread_local`` and intentionally left open when the ``with`` block
    exits so that later calls from the same thread reuse it.
    """
    connection = getattr(thread_local, "connection", None)
    if connection is None:
        connection = sqlite3.connect('crawl_cache.db')
        thread_local.connection = connection
    # No cleanup after the yield: the connection stays open for reuse.
    yield connection
def init_db():
    """Create the ``pages`` cache table and its URL index when missing."""
    schema_statements = (
        '''CREATE TABLE IF NOT EXISTS pages
                     (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''',
        '''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''',
    )
    with get_db_connection() as conn:
        cursor = conn.cursor()
        for statement in schema_statements:
            cursor.execute(statement)
        conn.commit()
-init_db()
def clean_text(text):
    """Return *text* reduced to printable ASCII.

    Processing steps, in order:
      1. Every run of whitespace (newlines, tabs, non-breaking spaces, ...)
         becomes a single space. These characters are not "printable", so
         without this step they would simply be deleted and the words on
         either side would be fused together.
      2. Remaining non-printable characters (control codes) are removed.
      3. Every run of non-ASCII characters is replaced by one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r'\s+', ' ', text)
    text = ''.join(char for char in text if char.isprintable())
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    return text
async def get_page_content(session, url):
    """Download *url* and extract its readable text.

    Args:
        session: Open ``aiohttp`` client session used for the request.
        url: Address of the page to fetch.

    Returns:
        A list of cleaned text fragments taken from paragraph, heading and
        list-item elements, in the order they appear in the document. On a
        non-200 response or any exception the list instead holds a single
        error-message string.
    """
    content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']
    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching {url}: HTTP {response.status}")
                    return [f"Error fetching {url}: HTTP {response.status}"]
                page_html = await response.text()
                soup = BeautifulSoup(page_html, 'html.parser')
                # Prefer a semantic content container; fall back to the whole page.
                main_content = soup.find('article') or soup.find('main') or soup
                content = []
                # A single find_all over all tag names yields elements in
                # document order. Searching one tag name at a time collected
                # every paragraph before every heading, which scrambled the
                # reading order of the page.
                for element in main_content.find_all(content_tags):
                    fragment = clean_text(element.get_text(strip=True))
                    if fragment:
                        content.append(fragment)
                logger.info(f"Found {len(content)} content items for {url}")
                return content
    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return [f"Error processing {url}: {str(e)}"]
async def get_links(session, url, base_url):
    """Collect the in-site links found on the page at *url*.

    Args:
        session: Open ``aiohttp`` client session used for the request.
        url: Page whose anchors are inspected.
        base_url: Only links starting with this prefix are kept.

    Returns:
        A list of absolute URLs, without ``#fragment`` parts, each listed
        once in first-seen order. Links back to *url* itself are omitted.
        An empty list is returned on a non-200 response or any exception.
    """
    # Local import: only urljoin/urlparse are imported at module level.
    from urllib.parse import urldefrag

    try:
        async with rate_limiter:
            async with session.get(url, timeout=30) as response:
                if response.status != 200:
                    logger.error(f"Error fetching links from {url}: HTTP {response.status}")
                    return []
                text = await response.text()
                soup = BeautifulSoup(text, 'html.parser')
                current_page = urldefrag(url)[0]
                seen = set()
                valid_links = []
                for link in soup.find_all('a', href=True):
                    # "#section" variants address the same document, so drop
                    # the fragment before comparing; otherwise the same page
                    # would be queued once per in-page anchor.
                    full_url = urldefrag(urljoin(url, link['href']))[0]
                    if not full_url.startswith(base_url):
                        continue
                    if full_url == current_page or full_url in seen:
                        continue
                    seen.add(full_url)
                    valid_links.append(full_url)
                return valid_links
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
async def crawl_pages(base_url, max_depth):
    """Breadth-first crawl starting at *base_url*, up to *max_depth* levels.

    Page content is looked up in the ``pages`` table first and fetched only
    on a cache miss; freshly fetched content is written back to the table.

    Args:
        base_url: Start page; also the prefix passed to ``get_links`` to
            restrict which links are followed.
        max_depth: Maximum link depth to visit (the start page is depth 0).

    Returns:
        A list of ``(url, content)`` tuples in visiting order, where
        ``content`` is the list of text fragments for that page.
    """
    # Local imports: neither name is imported at module level.
    import ast
    from collections import deque

    visited = set()
    to_visit = deque([(base_url, 0)])  # deque gives O(1) pops from the left
    all_pages = []
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        while to_visit:
            current_url, depth = to_visit.popleft()
            if current_url in visited or depth > max_depth:
                continue
            visited.add(current_url)
            start_time = time.time()
            try:
                with get_db_connection() as conn:
                    c = conn.cursor()
                    c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
                    result = c.fetchone()
                content = None
                if result:
                    # The cache stores str(list_of_strings). literal_eval
                    # parses that back without executing arbitrary code,
                    # which eval() would do if the database were tampered with.
                    try:
                        content = ast.literal_eval(result[0])
                    except (ValueError, SyntaxError):
                        logger.warning(f"Ignoring unreadable cache entry for {current_url}")
                if content is None:
                    content = await get_page_content(session, current_url)
                    with get_db_connection() as conn:
                        c = conn.cursor()
                        # OR REPLACE so an unreadable cached row can be overwritten.
                        c.execute("INSERT OR REPLACE INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
                        conn.commit()
                all_pages.append((current_url, content))
                logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")
                if depth < max_depth:
                    links = await get_links(session, current_url, base_url)
                    for link in links:
                        if link not in visited:
                            to_visit.append((link, depth + 1))
            except Exception as e:
                logger.error(f"Error processing {current_url}: {str(e)}")
                # Continue with the next URL even if this one fails
    return all_pages
def generate_pdf_chunk(chunk, output_file):
    """Render one batch of crawled pages into a PDF written to *output_file*.

    Args:
        chunk: Iterable of ``(page_url, content)`` pairs, where ``content``
            is an iterable of text fragments.
        output_file: Destination path handed to ``FPDF.output``.
    """
    def write_fragment(document, fragment):
        # A fragment that cannot be written is logged and skipped.
        try:
            document.multi_cell(0, 10, txt=fragment[:200])  # Limit text length to avoid issues
        except Exception as e:
            logger.error(f"Error writing text to PDF: {str(e)}")

    document = FPDF()
    document.set_auto_page_break(auto=True, margin=15)
    document.add_page()
    document.set_font("Arial", size=12)

    for page_url, fragments in chunk:
        # Each page starts with its URL on a line of its own.
        document.cell(0, 10, txt=page_url, ln=True)
        document.ln(5)
        for fragment in fragments:
            write_fragment(document, fragment)
        # Add a new page if the current page is almost full
        page_nearly_full = document.get_y() > 250
        if page_nearly_full:
            document.add_page()

    document.output(output_file)
-def website_to_pdf(all_pages, progress_callback):
-    logger.info(f"Starting PDF generation for {len(all_pages)} pages")
-    chunk_size = 100
-    total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
-    temp_files = []
-    with tempfile.TemporaryDirectory() as temp_dir:
-        for i in range(0, len(all_pages), chunk_size):
-            chunk = all_pages[i:i+chunk_size]
-            temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
-            generate_pdf_chunk(chunk, temp_file)
-            temp_files.append(temp_file)
-            progress = min((i + chunk_size) / len(all_pages), 1.0)
-            progress_callback(f"Processing pages... {progress:.0%}")
-            logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
-        logger.info("Mer

 from dash import dcc, html, Input, Output, State
 import dash_bootstrap_components as dbc
 from dash.exceptions import PreventUpdate
+import google.generativeai as genai
+from github import Github
+import gitlab
 import requests
 import tempfile
+import docx
+import os
+import logging
+import threading
+from huggingface_hub import HfApi
+from flask import send_file
# Configure logging: INFO level, lines formatted as "time - level - message"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server  # Expose the Flask server
# Hugging Face API setup
# NOTE(review): hf_api is not referenced anywhere else in the visible code - confirm it is still needed
hf_api = HfApi()
# Credentials read from environment variables; each is None when its variable is unset.
# GITHUB_TOKEN is also the token passed to the GitLab and Gitea clients in get_file_contents.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
# File-name suffixes treated as UI/template sources. Kept lower-case because
# is_ui_file compares against the lower-cased file name.
_UI_EXTENSIONS = (
    '.erb', '.haml', '.slim', '.php', '.aspx', '.jsp', '.ftl', '.twig',
    '.mustache', '.handlebars', '.ejs', '.pug', '.blade.php', '.xhtml',
    '.fxml', '.tsx', '.jsx', '.vue', '.html', '.cshtml', '.razor', '.xaml',
)


def is_ui_file(filename):
    """Return True when *filename* ends with a known UI/template extension.

    The comparison ignores case, so ``INDEX.HTML`` is recognised just like
    ``index.html``.

    Args:
        filename: File name or path to test.

    Returns:
        bool: Whether the name ends with one of ``_UI_EXTENSIONS``.
    """
    # str.endswith accepts a tuple, so a single call checks every suffix.
    return filename.lower().endswith(_UI_EXTENSIONS)
def get_file_contents(git_provider, repo_url, exclude_folders):
    """Fetch the text of every UI file in a hosted Git repository.

    Args:
        git_provider: ``"GitHub"``, ``"GitLab"`` or ``"Gitea"``.
        repo_url: Repository identifier in ``owner/repo`` form.
        exclude_folders: Comma-separated folder paths to skip. May be
            ``None`` or empty, in which case nothing is excluded. A folder
            excludes itself and everything beneath it; matching is done on
            whole path components, so ``doc`` does not exclude ``docs/``.

    Returns:
        A list of ``(path, text)`` tuples, one per UI file found.

    Raises:
        ValueError: If *git_provider* is not one of the supported values.
        Exception: Errors raised by the provider client libraries or by
            ``requests`` propagate to the caller.
    """
    # Local import: deque is not imported at module level.
    from collections import deque

    request_timeout = 30  # seconds, for the raw HTTP calls made to Gitea

    file_contents = []
    logger.info(f"Fetching files from {git_provider} repository: {repo_url}")

    # exclude_folders is None when the caller supplies no value; treat that
    # like an empty string instead of failing on None.split.
    excluded = [folder.strip().strip('/') for folder in (exclude_folders or '').split(',')]
    excluded = [folder for folder in excluded if folder]

    def is_excluded(path):
        # Compare whole path components rather than raw string prefixes.
        return any(path == folder or path.startswith(folder + '/') for folder in excluded)

    if git_provider == "GitHub":
        g = Github(GITHUB_TOKEN)
        repo = g.get_repo(repo_url)
        pending = deque(repo.get_contents(""))
        while pending:
            entry = pending.popleft()
            if is_excluded(entry.path):
                continue
            if entry.type == "dir":
                pending.extend(repo.get_contents(entry.path))
            elif is_ui_file(entry.name):
                logger.info(f"Found UI file: {entry.path}")
                file_contents.append((entry.path, entry.decoded_content.decode('utf-8', errors='ignore')))
    elif git_provider == "GitLab":
        gl = gitlab.Gitlab(url='https://gitlab.com', private_token=GITHUB_TOKEN)
        project = gl.projects.get(repo_url)
        # Read from the project's real default branch instead of assuming "main".
        ref = project.default_branch or 'main'
        # all=True walks every result page; without it only the first page of
        # the tree listing is returned.
        items = project.repository_tree(recursive=True, all=True, ref=ref)
        for item in items:
            if item['type'] == 'blob' and is_ui_file(item['name']) and not is_excluded(item['path']):
                logger.info(f"Found UI file: {item['path']}")
                file_content = project.files.get(item['path'], ref=ref)
                file_contents.append((item['path'], file_content.decode().decode('utf-8', errors='ignore')))
    elif git_provider == "Gitea":
        base_url = "https://gitea.com/api/v1"
        headers = {"Authorization": f"token {GITHUB_TOKEN}"}

        def recursive_get_contents(path=""):
            response = requests.get(
                f"{base_url}/repos/{repo_url}/contents/{path}",
                headers=headers,
                timeout=request_timeout,
            )
            response.raise_for_status()
            for item in response.json():
                if is_excluded(item['path']):
                    continue
                if item['type'] == 'file' and is_ui_file(item['name']):
                    logger.info(f"Found UI file: {item['path']}")
                    # Send the token here as well and check the status, so a
                    # failed download raises instead of an error page being
                    # stored as if it were the file's content.
                    download = requests.get(item['download_url'], headers=headers, timeout=request_timeout)
                    download.raise_for_status()
                    file_contents.append((item['path'], download.text))
                elif item['type'] == 'dir':
                    recursive_get_contents(item['path'])

        recursive_get_contents()
    else:
        raise ValueError("Unsupported Git provider")
    logger.info(f"Total UI files found: {len(file_contents)}")
    return file_contents
def _user_guide_prompt(file_path, file_content):
    """Build the Gemini prompt used for a "User Guide" section."""
    return f"""Based on the following UI-related code file, generate a section for a user guide:
        File: {file_path}
        Content:
        {file_content}
        Please focus on:
        1. The specific features and functionality this UI component provides to the end users
        2. Step-by-step instructions on how to use these features
        3. Any user interactions or inputs required
        4. Expected outcomes or results for the user
        Important formatting instructions:
        - The output should be in plain text no markdown for example do not use * or ** or # or ##.  Instead use numbers like 1., 2. for bullets
        - Use clear section titles
        - Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
        - Explain the purpose and benefit of each feature for non-technical users
        - This is an end user manual, not a system administration manual so focus on the end user components
        """


def _admin_guide_prompt(file_path, file_content):
    """Build the Gemini prompt used for an administration guide section."""
    return f"""Based on the following UI-related code file, generate a section for an System guide:
        File: {file_path}
        Content:
        {file_content}
        Please focus on explaining what that component is and does:
        1. Any configuration options or settings related to this UI component
        2. Security considerations or access control related to this feature
        3. How to monitor or troubleshoot issues with this component
        4. Best practices for managing and maintaining this part of the system
        Important formatting instructions:
        - The output should be in plain text no markdown for example for example do not use * or ** or # or ##.  Instead use numbers like 1., 2. for bullets
        - Use clear section titles
        - Use clear section titles that has the name of the file in parenthesis
        - Follow this numbering heirarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
        - Explain the purpose and implications of each component
        """


def generate_guide_section(file_path, file_content, guide_type):
    """Ask Gemini to write one guide section for a single UI file.

    Args:
        file_path: Repository path of the file, quoted in the prompt.
        file_content: Source text of the file, embedded in the prompt.
        guide_type: ``"User Guide"`` selects the end-user prompt; every
            other value selects the administration prompt.

    Returns:
        The text of the model's response.
    """
    logger.info(f"Generating {guide_type} section for file: {file_path}")
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-2.0-flash-lite')

    # Anything other than "User Guide" is treated as an administration guide.
    build_prompt = _user_guide_prompt if guide_type == "User Guide" else _admin_guide_prompt
    response = model.generate_content(build_prompt(file_path, file_content))

    logger.info(f"Generated {guide_type} section for {file_path}")
    return response.text
def generate_guide(git_provider, repo_url, guide_type, exclude_folders):
    """Generate a guide for a repository and save it as DOCX and Markdown.

    One section is generated per UI file returned by ``get_file_contents``;
    the sections are joined under a ``# <guide_type>`` title.

    Args:
        git_provider: Provider name forwarded to ``get_file_contents``.
        repo_url: Repository identifier forwarded to ``get_file_contents``.
        guide_type: Guide title; also selects the prompt in
            ``generate_guide_section``.
        exclude_folders: Comma-separated folders forwarded to
            ``get_file_contents``.

    Returns:
        ``(full_guide, docx_path, md_path)`` on success. The two paths are
        temporary files that are not deleted by this function. On any
        failure the tuple is ``("An error occurred: ...", None, None)``.
    """
    try:
        logger.info(f"Starting guide generation for {repo_url}")
        file_contents = get_file_contents(git_provider, repo_url, exclude_folders)
        guide_sections = []
        for file_path, content in file_contents:
            section = generate_guide_section(file_path, content, guide_type)
            guide_sections.append(section)
            logger.info(f"Added section for {file_path}")
        guide_body = "\n\n".join(guide_sections)
        full_guide = f"# {guide_type}\n\n" + guide_body

        logger.info("Creating DOCX file")
        doc = docx.Document()
        doc.add_heading(guide_type, 0)
        # Render only the body: the title is already the document heading, so
        # walking full_guide would print it a second time as a level-1 heading.
        for line in guide_body.split('\n'):
            line = line.strip()
            if not line:
                # Blank separator lines would otherwise become empty paragraphs.
                continue
            if line.startswith('# '):
                doc.add_heading(line[2:], level=1)
            elif line.startswith('## '):
                doc.add_heading(line[3:], level=2)
            elif line.startswith('Step'):
                doc.add_paragraph(line, style='List Number')
            else:
                doc.add_paragraph(line)
        # Reserve a temp path, then save after the handle is closed; writing
        # to a path that is still held open fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_docx:
            docx_path = temp_docx.name
        doc.save(docx_path)
        logger.info(f"DOCX file saved: {docx_path}")

        logger.info("Creating Markdown file")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as temp_md:
            temp_md.write(full_guide)
            md_path = temp_md.name
        logger.info(f"Markdown file saved: {md_path}")
        logger.info("Guide generation completed successfully")
        return full_guide, docx_path, md_path
    except Exception as e:
        # Top-level boundary for the UI: log the traceback and hand the
        # message back so it can be shown in the preview pane.
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
        return f"An error occurred: {str(e)}", None, None
# App layout
def _build_layout():
    """Assemble the page: navbar on top, form and downloads left, preview right."""
    # --- Navbar: logo and brand name linking back to the root page ---
    brand_link = html.A(
        dbc.Row(
            [
                dbc.Col(html.Img(src="/assets/logo.png", height="30px")),
                dbc.Col(dbc.NavbarBrand("Automated Guide Generator", className="ms-2")),
            ],
            align="center",
            className="g-0",
        ),
        href="/",
        style={"textDecoration": "none"},
    )
    navbar = dbc.Navbar(
        dbc.Container([brand_link]),
        color="primary",
        dark=True,
    )

    # --- Input form: provider, repository, guide type, exclusions ---
    provider_select = dbc.Select(
        id="git-provider",
        options=[{"label": name, "value": name} for name in ("GitHub", "GitLab", "Gitea")],
        placeholder="Select Git Provider",
    )
    guide_type_radio = dbc.RadioItems(
        id="guide-type",
        options=[{"label": name, "value": name} for name in ("User Guide", "Administration Guide")],
        inline=True,
    )
    form = dbc.Form([
        provider_select,
        dbc.Input(id="repo-url", type="text", placeholder="Repository URL (owner/repo)"),
        guide_type_radio,
        dbc.Input(id="exclude-folders", type="text", placeholder="Exclude Folders (comma-separated)"),
        dbc.Button("Generate Guide", id="generate-button", color="primary", className="mt-3"),
    ])
    form_card = dbc.Card([dbc.CardBody([form])], className="mb-4")

    # --- Download card, wrapped in a spinner ---
    download_buttons = html.Div(
        [
            dbc.Button("Download DOCX", id="download-docx", color="secondary", className="me-2"),
            dbc.Button("Download Markdown", id="download-md", color="secondary"),
        ],
        className="mt-3",
    )
    download_card = dbc.Card(
        [
            dbc.CardBody([
                html.H4("Generated Guide", className="card-title"),
                download_buttons,
                dcc.Download(id="download-docx-file"),
                dcc.Download(id="download-md-file"),
            ])
        ],
        className="mt-4",
    )

    # --- Preview card: scrollable pane that shows the generated text ---
    preview_card = dbc.Card(
        [
            dbc.CardBody([
                html.H4("Preview", className="card-title"),
                html.Div(id="generated-guide", style={"whiteSpace": "pre-wrap", "height": "400px", "overflowY": "auto"}),
            ])
        ],
        className="mt-4",
    )

    left_column = dbc.Col(
        [
            html.H1("Automated Guide Generator", className="text-center my-4"),
            html.P("Generate a user guide or administration guide based on the UI-related code in a Git repository using Gemini AI. Select a Git provider, enter repository details, choose the guide type, and let AI create a comprehensive guide.", className="text-center mb-4"),
            form_card,
            dbc.Spinner(download_card, color="primary"),
        ],
        width=6,
    )
    right_column = dbc.Col([preview_card], width=6)

    return dbc.Container([navbar, dbc.Row([left_column, right_column])], fluid=True)


app.layout = _build_layout()
# Paths of the most recently generated guide files. update_output writes
# them and the two download callbacks read them; previously those callbacks
# referenced names that only existed as locals of update_output.
# NOTE: this state is per process, not per browser session - every visitor
# downloads the latest guide generated in this process.
_generated_files = {"docx": None, "md": None}
_generated_files_lock = threading.Lock()


def _remove_quietly(path):
    """Delete *path* if it is set, ignoring errors such as a missing file."""
    if not path:
        return
    try:
        os.remove(path)
    except OSError:
        pass


@app.callback(
    [Output("generated-guide", "children"),
     Output("download-docx", "n_clicks"),
     Output("download-md", "n_clicks")],
    [Input("generate-button", "n_clicks")],
    [State("git-provider", "value"),
     State("repo-url", "value"),
     State("guide-type", "value"),
     State("exclude-folders", "value")]
)
def update_output(n_clicks, git_provider, repo_url, guide_type, exclude_folders):
    """Generate the guide when the button is clicked and show it in the preview.

    Returns:
        ``(preview_text, 0, 0)``: the guide text (or a message explaining
        what went wrong) and a reset of both download buttons' click counts.
    """
    if n_clicks is None:
        raise PreventUpdate
    # Fail early with a readable message when a required field is empty.
    if not git_provider or not repo_url or not guide_type:
        return "Please select a Git provider, enter a repository and choose a guide type.", 0, 0

    # Called directly: starting a thread only to join it immediately added
    # nothing over a plain function call.
    guide_text, docx_path, md_path = generate_guide(git_provider, repo_url, guide_type, exclude_folders)

    with _generated_files_lock:
        previous = dict(_generated_files)
        _generated_files["docx"] = docx_path
        _generated_files["md"] = md_path
    # The superseded temporary files can no longer be downloaded; remove them.
    _remove_quietly(previous["docx"])
    _remove_quietly(previous["md"])

    return guide_text, 0, 0  # Reset n_clicks for download buttons


@app.callback(
    Output("download-docx-file", "data"),
    Input("download-docx", "n_clicks"),
    prevent_initial_call=True,
)
def download_docx(n_clicks):
    """Send the most recently generated DOCX file to the browser."""
    # update_output resets n_clicks to 0, which also triggers this callback;
    # only a real click (n_clicks >= 1) should start a download.
    if not n_clicks:
        raise PreventUpdate
    with _generated_files_lock:
        path = _generated_files["docx"]
    if not path or not os.path.exists(path):
        raise PreventUpdate
    return dcc.send_file(path, filename="generated_guide.docx")


@app.callback(
    Output("download-md-file", "data"),
    Input("download-md", "n_clicks"),
    prevent_initial_call=True,
)
def download_md(n_clicks):
    """Send the most recently generated Markdown file to the browser."""
    # See download_docx: a reset to 0 is not a click.
    if not n_clicks:
        raise PreventUpdate
    with _generated_files_lock:
        path = _generated_files["md"]
    if not path or not os.path.exists(path):
        raise PreventUpdate
    return dcc.send_file(path, filename="generated_guide.md")
if __name__ == '__main__':
    # The server listens on every interface, so debug mode (interactive
    # debugger, hot reload) must not be on by default. Opt in explicitly by
    # setting DASH_DEBUG=1 (or true/yes/on) in the environment.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")