bluenevus committed on
Commit
459429d
·
verified ·
1 Parent(s): e82072d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +281 -177
app.py CHANGED
@@ -2,197 +2,301 @@ import dash
2
  from dash import dcc, html, Input, Output, State
3
  import dash_bootstrap_components as dbc
4
  from dash.exceptions import PreventUpdate
5
- import base64
 
 
6
  import requests
7
- from bs4 import BeautifulSoup
8
- from urllib.parse import urljoin, urlparse
9
- from fpdf import FPDF
10
- import re
11
- import logging
12
- import asyncio
13
- import aiohttp
14
- from aiolimiter import AsyncLimiter
15
- import sqlite3
16
- from contextlib import contextmanager
17
- from threading import local
18
- import time
19
- import os
20
- import ssl
21
- from io import BytesIO
22
  import tempfile
23
- import uuid
24
- from concurrent.futures import ThreadPoolExecutor
25
- from PyPDF2 import PdfMerger
 
 
 
 
 
 
 
26
 
27
  # Initialize Dash app
28
  app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
29
# Module-level handle on the Dash app's underlying server object.
server = app.server

# Logging setup: INFO level on the root logger, plus a module-scoped logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Thread-local storage for database connections (get_db_connection caches
# one sqlite3 connection per thread on this object).
thread_local = local()

# Rate limiter: 10 requests per second
rate_limiter = AsyncLimiter(10, 1)

# Create an SSL context that ignores certificate verification.
# NOTE(review): with check_hostname off and CERT_NONE, HTTPS peers are not
# authenticated at all; any connection made with this context is open to
# interception. Confirm this trade-off is intentional.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE

# ThreadPoolExecutor for background tasks (at most 4 worker threads).
executor = ThreadPoolExecutor(max_workers=4)
 
 
 
 
 
48
 
49
@contextmanager
def get_db_connection():
    """Yield the calling thread's SQLite connection to ``crawl_cache.db``.

    The connection is created on first use, cached on ``thread_local`` and
    intentionally left open when the ``with`` block ends so that later calls
    on the same thread reuse it.
    """
    try:
        connection = thread_local.connection
    except AttributeError:
        # First use on this thread: open the connection and remember it.
        connection = thread_local.connection = sqlite3.connect('crawl_cache.db')
    yield connection
57
-
58
def init_db():
    """Create the ``pages`` cache table and its URL index when missing."""
    schema_statements = (
        '''CREATE TABLE IF NOT EXISTS pages
                 (url TEXT PRIMARY KEY, content TEXT, depth INTEGER)''',
        '''CREATE INDEX IF NOT EXISTS idx_url ON pages(url)''',
    )
    with get_db_connection() as conn:
        cursor = conn.cursor()
        for statement in schema_statements:
            cursor.execute(statement)
        conn.commit()


# Make sure the schema exists as soon as the module is imported.
init_db()
67
-
68
def clean_text(text):
    """Return *text* reduced to printable ASCII.

    Non-printable characters (including newlines and tabs) are removed
    first; each remaining run of non-ASCII characters is then replaced by a
    single space.
    """
    printable_only = ''.join(filter(str.isprintable, text))
    return re.sub(r'[^\x00-\x7F]+', ' ', printable_only)
72
-
73
async def get_page_content(session, url):
    """Fetch *url* and return the cleaned text of its main content elements.

    The text is taken from the first ``<article>``, else the first
    ``<main>``, else the whole document, collecting paragraphs, headings and
    list items tag type by tag type. On a non-200 response or any exception
    the returned list holds a single error message instead.
    """
    text_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li')
    try:
        async with rate_limiter, session.get(url, timeout=30) as response:
            if response.status != 200:
                failure = f"Error fetching {url}: HTTP {response.status}"
                logger.error(failure)
                return [failure]

            markup = await response.text()
            soup = BeautifulSoup(markup, 'html.parser')
            root = soup.find('article') or soup.find('main') or soup

            content = []
            if root:
                for tag_name in text_tags:
                    cleaned = (
                        clean_text(node.get_text(strip=True))
                        for node in root.find_all(tag_name)
                    )
                    # Drop elements that end up empty after cleaning.
                    content.extend(piece for piece in cleaned if piece)

            logger.info(f"Found {len(content)} content items for {url}")
            return content
    except Exception as e:
        failure = f"Error processing {url}: {str(e)}"
        logger.error(failure)
        return [failure]
96
 
97
async def get_links(session, url, base_url):
    """Return the absolute URLs linked from *url* that start with *base_url*.

    Relative hrefs are resolved against *url*, and links pointing back to
    *url* itself are dropped. A non-200 response or any exception is logged
    and yields an empty list.
    """
    try:
        async with rate_limiter, session.get(url, timeout=30) as response:
            if response.status != 200:
                logger.error(f"Error fetching links from {url}: HTTP {response.status}")
                return []

            markup = await response.text()
            soup = BeautifulSoup(markup, 'html.parser')
            absolute_urls = (
                urljoin(url, anchor['href'])
                for anchor in soup.find_all('a', href=True)
            )
            return [
                candidate
                for candidate in absolute_urls
                if candidate.startswith(base_url) and candidate != url
            ]
    except Exception as e:
        logger.error(f"Error getting links from {url}: {str(e)}")
        return []
117
-
118
async def crawl_pages(base_url, max_depth):
    """Breadth-first crawl of *base_url* down to *max_depth* link hops.

    Page content is read from the SQLite ``pages`` cache when available and
    fetched (then cached) otherwise. A failure on one URL is logged and the
    crawl continues with the next one.

    Args:
        base_url: Start URL; only links beginning with it are followed.
        max_depth: Greatest depth to visit; the start URL is depth 0.

    Returns:
        A list of ``(url, content)`` tuples in visiting order, where
        ``content`` is the list of text items for that page.
    """
    import ast
    from collections import deque

    visited = set()
    queued = {base_url}  # URLs already placed in the queue, to avoid duplicates
    to_visit = deque([(base_url, 0)])  # deque gives O(1) pops from the left
    all_pages = []

    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=ssl_context)) as session:
        while to_visit:
            current_url, depth = to_visit.popleft()
            if current_url in visited or depth > max_depth:
                continue

            visited.add(current_url)
            start_time = time.time()

            try:
                with get_db_connection() as conn:
                    c = conn.cursor()
                    c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
                    result = c.fetchone()

                content = None
                if result:
                    # The cache stores str(list); literal_eval parses that
                    # without executing arbitrary code the way eval() would.
                    try:
                        content = ast.literal_eval(result[0])
                    except (ValueError, SyntaxError):
                        logger.warning(f"Discarding unreadable cache entry for {current_url}")

                if content is None:
                    # NOTE(review): error-message results from get_page_content
                    # are cached as well, so a transient failure stays cached.
                    content = await get_page_content(session, current_url)
                    with get_db_connection() as conn:
                        c = conn.cursor()
                        # OR REPLACE lets a fresh fetch overwrite an unreadable row.
                        c.execute(
                            "INSERT OR REPLACE INTO pages VALUES (?, ?, ?)",
                            (current_url, str(content), depth),
                        )
                        conn.commit()

                all_pages.append((current_url, content))
                logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

                if depth < max_depth:
                    links = await get_links(session, current_url, base_url)
                    for link in links:
                        if link not in queued:
                            queued.add(link)
                            to_visit.append((link, depth + 1))
            except Exception as e:
                logger.error(f"Error processing {current_url}: {str(e)}")
                # Continue with the next URL even if this one fails

    return all_pages
160
-
161
def generate_pdf_chunk(chunk, output_file):
    """Render a batch of crawled pages into one PDF written to *output_file*.

    Args:
        chunk: Iterable of ``(page_url, content)`` pairs; each item of
            ``content`` is sliced and written as a text cell.
        output_file: Destination handed to ``FPDF.output``.
    """
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", size=12)

    for page_url, content in chunk:
        # The page URL is written as a heading line for the page's text.
        pdf.cell(0, 10, txt=page_url, ln=True)
        pdf.ln(5)
        for text in content:
            try:
                pdf.multi_cell(0, 10, txt=text[:200]) # Limit text length to avoid issues
            except Exception as e:
                # A single unwritable text item is logged and skipped.
                logger.error(f"Error writing text to PDF: {str(e)}")
        # NOTE(review): indentation was lost in this view; this check is
        # assumed to run once per page rather than once per text item - confirm.
        if pdf.get_y() > 250: # Add a new page if the current page is almost full
            pdf.add_page()

    pdf.output(output_file)
179
-
180
- def website_to_pdf(all_pages, progress_callback):
181
- logger.info(f"Starting PDF generation for {len(all_pages)} pages")
182
 
183
- chunk_size = 100
184
- total_chunks = (len(all_pages) + chunk_size - 1) // chunk_size
185
- temp_files = []
186
-
187
- with tempfile.TemporaryDirectory() as temp_dir:
188
- for i in range(0, len(all_pages), chunk_size):
189
- chunk = all_pages[i:i+chunk_size]
190
- temp_file = os.path.join(temp_dir, f"chunk_{i}.pdf")
191
- generate_pdf_chunk(chunk, temp_file)
192
- temp_files.append(temp_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
- progress = min((i + chunk_size) / len(all_pages), 1.0)
195
- progress_callback(f"Processing pages... {progress:.0%}")
196
- logger.info(f"Generated PDF chunk {i//chunk_size + 1}/{total_chunks}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
- logger.info("Mer
 
 
 
 
2
  from dash import dcc, html, Input, Output, State
3
  import dash_bootstrap_components as dbc
4
  from dash.exceptions import PreventUpdate
5
+ import google.generativeai as genai
6
+ from github import Github
7
+ import gitlab
8
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import tempfile
10
+ import docx
11
+ import os
12
+ import logging
13
+ import threading
14
+ from huggingface_hub import HfApi
15
+ from flask import send_file
16
+
17
# Configure logging: INFO level with timestamp, level name and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
server = app.server # Expose the Flask server

# Hugging Face API setup
# NOTE(review): hf_api is not referenced anywhere else in the visible code.
hf_api = HfApi()

# Read API credentials from environment variables. GITHUB_TOKEN is also the
# token passed to the GitLab and Gitea clients in get_file_contents.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
31
 
32
# File-name suffixes treated as UI templates/components. Kept as a tuple so
# ``str.endswith`` can test every suffix in a single call.
_UI_EXTENSIONS = (
    '.erb', '.haml', '.slim', '.php', '.aspx', '.jsp', '.ftl', '.twig',
    '.mustache', '.handlebars', '.ejs', '.pug', '.blade.php', '.xhtml',
    '.fxml', '.tsx', '.jsx', '.vue', '.html', '.cshtml', '.razor', '.xaml',
)


def is_ui_file(filename):
    """Return True when *filename* ends with a known UI file extension.

    The comparison ignores case, so ``INDEX.HTML`` is recognised just like
    ``index.html``.
    """
    return filename.lower().endswith(_UI_EXTENSIONS)
35
 
36
# Upper bound, in seconds, for each HTTP request made to the Gitea API.
_REQUEST_TIMEOUT_SECONDS = 30


def _parse_exclude_folders(exclude_folders):
    """Split the comma-separated exclude string into clean folder paths."""
    if not exclude_folders:
        # The Dash input yields None when the user never typed in it.
        return []
    folders = (part.strip().strip('/') for part in exclude_folders.split(','))
    return [folder for folder in folders if folder]


def _is_excluded(path, folders):
    """Return True when *path* is one of *folders* or lies inside one."""
    # Matching on the path-segment boundary keeps "src" from excluding "src2".
    return any(path == folder or path.startswith(folder + '/') for folder in folders)


def _collect_github_files(repo_url, folders):
    """Walk a GitHub repository breadth-first and return its UI files."""
    from collections import deque

    repo = Github(GITHUB_TOKEN).get_repo(repo_url)
    pending = deque(repo.get_contents(""))
    found = []
    while pending:
        entry = pending.popleft()
        if _is_excluded(entry.path, folders):
            continue
        if entry.type == "dir":
            pending.extend(repo.get_contents(entry.path))
        elif is_ui_file(entry.name):
            logger.info(f"Found UI file: {entry.path}")
            found.append((entry.path, entry.decoded_content.decode('utf-8', errors='ignore')))
    return found


def _collect_gitlab_files(repo_url, folders):
    """List a GitLab project's tree and return its UI files."""
    # NOTE(review): the GitLab client is given GITHUB_TOKEN; confirm that this
    # variable really holds a GitLab token when this provider is selected.
    gl = gitlab.Gitlab(url='https://gitlab.com', private_token=GITHUB_TOKEN)
    project = gl.projects.get(repo_url)
    # Read files from the project's default branch instead of assuming "main".
    ref = getattr(project, 'default_branch', None) or 'main'
    found = []
    # get_all=True walks every result page; the default is the first page only.
    for item in project.repository_tree(recursive=True, get_all=True):
        if item['type'] != 'blob' or not is_ui_file(item['name']):
            continue
        if _is_excluded(item['path'], folders):
            continue
        logger.info(f"Found UI file: {item['path']}")
        file_content = project.files.get(item['path'], ref=ref)
        found.append((item['path'], file_content.decode().decode('utf-8', errors='ignore')))
    return found


def _collect_gitea_files(repo_url, folders):
    """Walk a gitea.com repository recursively and return its UI files."""
    base_url = "https://gitea.com/api/v1"
    # NOTE(review): Gitea is also authenticated with GITHUB_TOKEN - confirm.
    headers = {"Authorization": f"token {GITHUB_TOKEN}"}
    found = []

    def recursive_get_contents(path=""):
        response = requests.get(
            f"{base_url}/repos/{repo_url}/contents/{path}",
            headers=headers,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
        response.raise_for_status()
        for item in response.json():
            if _is_excluded(item['path'], folders):
                continue
            if item['type'] == 'file' and is_ui_file(item['name']):
                logger.info(f"Found UI file: {item['path']}")
                # Send the token on the download too, so private repos work.
                download = requests.get(
                    item['download_url'],
                    headers=headers,
                    timeout=_REQUEST_TIMEOUT_SECONDS,
                )
                download.raise_for_status()
                found.append((item['path'], download.text))
            elif item['type'] == 'dir':
                recursive_get_contents(item['path'])

    recursive_get_contents()
    return found


def get_file_contents(git_provider, repo_url, exclude_folders):
    """Fetch the UI-related files of a repository.

    Args:
        git_provider: One of "GitHub", "GitLab" or "Gitea".
        repo_url: Repository identifier in ``owner/repo`` form.
        exclude_folders: Comma-separated folder paths to skip; ``None`` or an
            empty string excludes nothing.

    Returns:
        A list of ``(path, text)`` tuples, one per UI file found.

    Raises:
        ValueError: If *git_provider* is not a supported provider.
    """
    logger.info(f"Fetching files from {git_provider} repository: {repo_url}")
    folders = _parse_exclude_folders(exclude_folders)

    if git_provider == "GitHub":
        file_contents = _collect_github_files(repo_url, folders)
    elif git_provider == "GitLab":
        file_contents = _collect_gitlab_files(repo_url, folders)
    elif git_provider == "Gitea":
        file_contents = _collect_gitea_files(repo_url, folders)
    else:
        raise ValueError("Unsupported Git provider")

    logger.info(f"Total UI files found: {len(file_contents)}")
    return file_contents
79
 
80
def generate_guide_section(file_path, file_content, guide_type):
    """Ask Gemini to write one guide section for a single UI file.

    Args:
        file_path: Repository path of the file, quoted in the prompt.
        file_content: Source text of the file, embedded in the prompt.
        guide_type: "User Guide" selects the end-user prompt; any other value
            selects the administration prompt.

    Returns:
        The generated section text, or a short placeholder line when the
        model returned no usable text for this file.
    """
    logger.info(f"Generating {guide_type} section for file: {file_path}")
    genai.configure(api_key=GEMINI_API_KEY)
    model = genai.GenerativeModel('gemini-2.0-flash-lite')

    if guide_type == "User Guide":
        prompt = f"""Based on the following UI-related code file, generate a section for a user guide:

File: {file_path}
Content:
{file_content}

Please focus on:
1. The specific features and functionality this UI component provides to the end users
2. Step-by-step instructions on how to use these features
3. Any user interactions or inputs required
4. Expected outcomes or results for the user

Important formatting instructions:
- The output should be in plain text no markdown for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
- Use clear section titles
- Follow this numbering hierarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
- Explain the purpose and benefit of each feature for non-technical users
- This is an end user manual, not a system administration manual so focus on the end user components
"""
    else:  # Administration Guide
        prompt = f"""Based on the following UI-related code file, generate a section for an administration guide:

File: {file_path}
Content:
{file_content}

Please focus on explaining what that component is and does:
1. Any configuration options or settings related to this UI component
2. Security considerations or access control related to this feature
3. How to monitor or troubleshoot issues with this component
4. Best practices for managing and maintaining this part of the system

Important formatting instructions:
- The output should be in plain text no markdown for example do not use * or ** or # or ##. Instead use numbers like 1., 2. for bullets
- Use clear section titles that include the name of the file in parentheses
- Follow this numbering hierarchy (1.0, 1.1, 1.2), (2.0, 2.1, 2.2), (3.0, 3.1, 3.2)
- Explain the purpose and implications of each component
"""

    response = model.generate_content(prompt)
    try:
        section_text = response.text
    except ValueError as e:
        # response.text raises ValueError when the response carries no text
        # part (for example a blocked prompt). One such file should not
        # abort the whole guide.
        logger.warning(f"No text returned for {file_path}: {str(e)}")
        return f"(No content could be generated for {file_path})"

    logger.info(f"Generated {guide_type} section for {file_path}")
    return section_text
129
+
130
def generate_guide(git_provider, repo_url, guide_type, exclude_folders):
    """Build a complete guide for a repository and save it as DOCX and Markdown.

    Args:
        git_provider: Provider name passed through to ``get_file_contents``.
        repo_url: Repository identifier in ``owner/repo`` form.
        guide_type: Guide title, also used to pick the prompt per section.
        exclude_folders: Comma-separated folder paths to skip.

    Returns:
        ``(guide_text, docx_path, md_path)`` on success. On any error the
        tuple is ``(error_message, None, None)``; the exception is logged,
        not raised.
    """
    try:
        logger.info(f"Starting guide generation for {repo_url}")
        file_contents = get_file_contents(git_provider, repo_url, exclude_folders)
        if not file_contents:
            logger.warning(f"No UI files found in {repo_url}; the guide will only contain its title")

        guide_sections = []
        for file_path, content in file_contents:
            guide_sections.append(generate_guide_section(file_path, content, guide_type))
            logger.info(f"Added section for {file_path}")

        full_guide = f"# {guide_type}\n\n" + "\n\n".join(guide_sections)

        logger.info("Creating DOCX file")
        doc = docx.Document()
        doc.add_heading(guide_type, 0)

        # Walk the sections rather than full_guide: its leading "# title" line
        # would repeat the document title as a second heading.
        for section in guide_sections:
            for raw_line in section.splitlines():
                line = raw_line.strip()
                if not line:
                    continue  # blank lines would only add empty paragraphs
                if line.startswith('# '):
                    doc.add_heading(line[2:], level=1)
                elif line.startswith('## '):
                    doc.add_heading(line[3:], level=2)
                elif line.startswith('Step'):
                    doc.add_paragraph(line, style='List Number')
                else:
                    doc.add_paragraph(line)

        # Reserve the file name, then save after the handle is closed; writing
        # to a still-open NamedTemporaryFile fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_docx:
            docx_path = temp_docx.name
        doc.save(docx_path)
        logger.info(f"DOCX file saved: {docx_path}")

        logger.info("Creating Markdown file")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.md', mode='w', encoding='utf-8') as temp_md:
            temp_md.write(full_guide)
            md_path = temp_md.name
        logger.info(f"Markdown file saved: {md_path}")

        logger.info("Guide generation completed successfully")
        return full_guide, docx_path, md_path

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}", exc_info=True)
        return f"An error occurred: {str(e)}", None, None
175
+
176
+ # App layout
177
# Top navigation bar: logo and application name, linking back to the root URL.
_navbar = dbc.Navbar(
    dbc.Container([
        html.A(
            dbc.Row(
                [
                    dbc.Col(html.Img(src="/assets/logo.png", height="30px")),
                    dbc.Col(dbc.NavbarBrand("Automated Guide Generator", className="ms-2")),
                ],
                align="center",
                className="g-0",
            ),
            href="/",
            style={"textDecoration": "none"},
        )
    ]),
    color="primary",
    dark=True,
)

# Input form: provider, repository, guide type, excluded folders, submit button.
_input_card = dbc.Card([
    dbc.CardBody([
        dbc.Form([
            dbc.Select(
                id="git-provider",
                options=[
                    {"label": provider, "value": provider}
                    for provider in ("GitHub", "GitLab", "Gitea")
                ],
                placeholder="Select Git Provider",
            ),
            dbc.Input(id="repo-url", type="text", placeholder="Repository URL (owner/repo)"),
            dbc.RadioItems(
                id="guide-type",
                options=[
                    {"label": kind, "value": kind}
                    for kind in ("User Guide", "Administration Guide")
                ],
                inline=True,
            ),
            dbc.Input(id="exclude-folders", type="text", placeholder="Exclude Folders (comma-separated)"),
            dbc.Button("Generate Guide", id="generate-button", color="primary", className="mt-3"),
        ])
    ])
], className="mb-4")

# Download buttons with their dcc.Download targets, wrapped in a spinner.
_downloads_card = dbc.Spinner(
    dbc.Card([
        dbc.CardBody([
            html.H4("Generated Guide", className="card-title"),
            html.Div([
                dbc.Button("Download DOCX", id="download-docx", color="secondary", className="me-2"),
                dbc.Button("Download Markdown", id="download-md", color="secondary"),
            ], className="mt-3"),
            dcc.Download(id="download-docx-file"),
            dcc.Download(id="download-md-file"),
        ])
    ], className="mt-4"),
    color="primary",
)

# Scrollable preview of the generated guide text.
_preview_card = dbc.Card([
    dbc.CardBody([
        html.H4("Preview", className="card-title"),
        html.Div(id="generated-guide", style={"whiteSpace": "pre-wrap", "height": "400px", "overflowY": "auto"}),
    ])
], className="mt-4")

# Page layout: navbar on top, inputs and downloads on the left, preview on the right.
app.layout = dbc.Container([
    _navbar,
    dbc.Row([
        dbc.Col([
            html.H1("Automated Guide Generator", className="text-center my-4"),
            html.P("Generate a user guide or administration guide based on the UI-related code in a Git repository using Gemini AI. Select a Git provider, enter repository details, choose the guide type, and let AI create a comprehensive guide.", className="text-center mb-4"),
            _input_card,
            _downloads_card,
        ], width=6),
        dbc.Col([
            _preview_card,
        ], width=6),
    ])
], fluid=True)
253
+
254
# Paths of the most recently generated guide files. The generation callback
# writes them and the download callbacks read them; as locals of
# update_output they were invisible to the download callbacks.
# NOTE: this is process-wide state, so all browser sessions share the latest
# guide. A per-session store would be needed for true multi-user use.
_latest_guide_files = {"docx": None, "md": None}


def _discard_previous_guide_files():
    """Delete the temp files of the previous guide and forget their paths."""
    for kind in ("docx", "md"):
        stale_path = _latest_guide_files[kind]
        _latest_guide_files[kind] = None
        if not stale_path:
            continue
        try:
            os.remove(stale_path)
        except OSError as e:
            logger.warning(f"Could not remove temporary file {stale_path}: {str(e)}")


@app.callback(
    [Output("generated-guide", "children"),
     Output("download-docx", "n_clicks"),
     Output("download-md", "n_clicks")],
    [Input("generate-button", "n_clicks")],
    [State("git-provider", "value"),
     State("repo-url", "value"),
     State("guide-type", "value"),
     State("exclude-folders", "value")]
)
def update_output(n_clicks, git_provider, repo_url, guide_type, exclude_folders):
    """Generate the guide when the button is clicked and show it in the preview.

    Returns the preview text plus zeros that reset the click counters of the
    two download buttons.
    """
    if not n_clicks:
        raise PreventUpdate

    # Untouched Dash inputs arrive as None; report them instead of failing later.
    required = (
        ("a Git provider", git_provider),
        ("a repository (owner/repo)", repo_url),
        ("a guide type", guide_type),
    )
    missing = [label for label, value in required if not value]
    if missing:
        return "Please provide " + ", ".join(missing) + " before generating a guide.", 0, 0

    # generate_guide is called directly: starting a thread only to join it
    # immediately blocked this callback for just as long.
    guide_text, docx_path, md_path = generate_guide(
        git_provider, repo_url.strip(), guide_type, exclude_folders or ""
    )

    _discard_previous_guide_files()
    _latest_guide_files["docx"] = docx_path
    _latest_guide_files["md"] = md_path

    return guide_text, 0, 0  # Reset n_clicks for download buttons


@app.callback(
    Output("download-docx-file", "data"),
    Input("download-docx", "n_clicks"),
    prevent_initial_call=True,
)
def download_docx(n_clicks):
    """Send the latest generated DOCX file to the browser."""
    # update_output resets n_clicks to 0, which also triggers this callback;
    # only a real click (n_clicks >= 1) may start a download.
    if not n_clicks:
        raise PreventUpdate
    docx_path = _latest_guide_files["docx"]
    if not docx_path or not os.path.exists(docx_path):
        raise PreventUpdate
    return dcc.send_file(docx_path, filename="generated_guide.docx")


@app.callback(
    Output("download-md-file", "data"),
    Input("download-md", "n_clicks"),
    prevent_initial_call=True,
)
def download_md(n_clicks):
    """Send the latest generated Markdown file to the browser."""
    # See download_docx: a reset to 0 must not count as a click.
    if not n_clicks:
        raise PreventUpdate
    md_path = _latest_guide_files["md"]
    if not md_path or not os.path.exists(md_path):
        raise PreventUpdate
    return dcc.send_file(md_path, filename="generated_guide.md")
298
 
299
if __name__ == '__main__':
    # The server listens on every interface (0.0.0.0). Debug mode enables an
    # interactive debugger that can run arbitrary code, so it is off unless
    # DASH_DEBUG is explicitly set to a truthy value.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")