acecalisto3 committed · verified
Commit 771baf7 · Parent(s): 71b0a3f

Update app2.py

Files changed (1): app2.py (+813 -306)
app2.py CHANGED
@@ -1,323 +1,830 @@
-import gradio as gr
-import requests
-import zipfile
-import uuid
-import bs4
-import lxml
-import os
-from huggingface_hub import InferenceClient, HfApi
-import random
 import json
-import datetime
-from pypdf import PdfReader
-from agent import (
-    PREFIX,
-    COMPRESS_DATA_PROMPT,
-    COMPRESS_DATA_PROMPT_SMALL,
-    LOG_PROMPT,
-    LOG_RESPONSE,
-)
-
-# Initialize Hugging Face client
-client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
-reponame = "acecalisto3/tmp"
-save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'
-
-# Get HF token from environment or use demo mode
-token_self = os.environ.get('HF_TOKEN', 'dummy_token')  # Use dummy token for demo
-if token_self == 'dummy_token':
-    print("Warning: Running in demo mode without HuggingFace token. Some features may be limited.")
-api = HfApi(token=token_self)
-
-# Constants
-VERBOSE = True
-MAX_HISTORY = 100
-MAX_DATA = 20000
-
-def find_all(purpose, task, history, url, result, steps):
-    return_list = []
-    visited_links = set()
-    links_to_visit = [(url, 0)]
-
-    while links_to_visit:
-        current_url, current_depth = links_to_visit.pop(0)
-        if current_depth < steps:
             try:
-                if current_url not in visited_links:
-                    visited_links.add(current_url)
-                    source = requests.get(current_url)
-                    if source.status_code == 200:
-                        soup = bs4.BeautifulSoup(source.content, 'lxml')
-                        rawp = f'RAW TEXT RETURNED: {soup.text}'
-                        return_list.append(rawp)
-
-                        for link in soup.find_all("a"):
-                            href = link.get('href')
-                            if href and href.startswith('http'):
-                                links_to_visit.append((href, current_depth + 1))
             except Exception as e:
-                print(f"Error fetching {current_url}: {e}")
-
-    return True, return_list
-
-def read_txt(txt_path):
-    with open(txt_path, "r") as f:
-        text = f.read()
-    return text
-
-def read_pdf(pdf_path):
-    text = ""
-    reader = PdfReader(pdf_path)
-    for page in reader.pages:
-        text = f'{text}\n{page.extract_text()}'
-    return text
-
-error_box = []
-def read_pdf_online(url):
-    print(f"reading {url}")
-    response = requests.get(url, stream=True)
-    if response.status_code == 200:
-        with open("test.pdf", "wb") as f:
-            f.write(response.content)
-        reader = PdfReader("test.pdf")
-        text = ""
-        for page in reader.pages:
-            text = f'{text}\n{page.extract_text()}'
-        return text
-    else:
-        error_box.append(url)
-        return str(response.status_code)
-
-def format_prompt(message, history):
-    prompt = "<s>"
-    for user_prompt, bot_response in history:
-        prompt += f"[INST] {user_prompt} [/INST]"
-        prompt += f" {bot_response}</s> "
-    prompt += f"[INST] {message} [/INST]"
-    return prompt
-
-def run_gpt(prompt_template, stop_tokens, max_tokens, seed, **prompt_kwargs):
-    timestamp = datetime.datetime.now()
-
-    generate_kwargs = dict(
-        temperature=0.9,
-        max_new_tokens=max_tokens,
-        top_p=0.95,
-        repetition_penalty=1.0,
-        do_sample=True,
-        seed=seed,
-    )
-
-    content = PREFIX.format(
-        timestamp=timestamp,
-        purpose="Compile the provided data and complete the users task"
-    ) + prompt_template.format(**prompt_kwargs)
-
-    if VERBOSE:
-        print(LOG_PROMPT.format(content))
-
-    stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    resp = ""
-    for response in stream:
-        resp += response.token.text
-
-    if VERBOSE:
-        print(LOG_RESPONSE.format(resp))
-    return resp
-
-def compress_data(c, instruct, history):
-    seed = random.randint(1, 1000000000)
-    divr = int(c)/MAX_DATA
-    divi = int(divr)+1 if divr != int(divr) else int(divr)
-    chunk = int(int(c)/divr)
-    out = []
-    s = 0
-    e = chunk
-
-    for z in range(divi):
-        hist = history[s:e]
-        resp = run_gpt(
-            COMPRESS_DATA_PROMPT_SMALL,
-            stop_tokens=["observation:", "task:", "action:", "thought:"],
-            max_tokens=8192,
-            seed=seed,
-            direction=instruct,
-            knowledge="",
-            history=hist,
         )
-        out.append(resp)
-        e = e+chunk
-        s = s+chunk
-    return out
-
-def create_zip_file(output_data, zip_name):
-    with zipfile.ZipFile(zip_name, 'w') as zipf:
-        for i, data in enumerate(output_data):
-            zipf.writestr(f'data_{i}.txt', data)
-    return zip_name
-
-def process_and_format_response(instructions, chat_history, report, summary_memory,
-                                input_data, uploaded_files, input_url, pdf_input_url):
     try:
-        # Process URL if provided
-        if input_url:
-            success, content = find_all("Extract content", "", [], input_url, "", 1)
-            if success and content:
-                processed_text = "\n".join(content)
-            else:
-                return "", [["Error", "Failed to fetch URL content"]], "URL processing failed", None
-
-        # Process uploaded files
-        elif uploaded_files:
-            processed_text = ""
-            for file in uploaded_files:
-                if file.name.endswith('.pdf'):
-                    processed_text += read_pdf(file.name) + "\n\n"
-                elif file.name.endswith('.txt'):
-                    processed_text += read_txt(file.name) + "\n\n"
-
-        # Process direct text input
-        elif input_data:
-            processed_text = input_data
         else:
-            return "", [["Error", "No input provided"]], "No input data", None
-
-        # Generate summary using compress_data
-        if processed_text:
-            c = len(processed_text.split())
-            summary = compress_data(c, instructions or "Summarize this text", processed_text)
-
-            # Format the response
-            if isinstance(summary, list):
-                summary_text = "\n".join(summary)
             else:
-                summary_text = str(summary)
-
-            # Create chat messages
-            messages = [
-                ["Input", processed_text[:500] + "..."],  # Show first 500 chars of input
-                ["Summary", summary_text]
-            ]
-
-            # Create JSON output
-            json_output = {
-                "input_length": len(processed_text),
-                "summary_length": len(summary_text),
-                "summary": summary_text
-            }
-
-            return "", messages, "Processing completed successfully", json_output
-
-    except Exception as e:
-        error_msg = f"Error: {str(e)}"
-        return "", [["Error", error_msg]], error_msg, None
-
-def clear_fn():
-    return "", []
-
-# Create Gradio interface
-with gr.Blocks() as app:
-    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")
-
-    # Main chat interface
-    with gr.Row():
-        chatbot = gr.Chatbot(
-            label="Mixtral 8x7B Chatbot",
-            show_copy_button=True,
-            height=400
-        )
-
-    # Control Panel
-    with gr.Row():
-        with gr.Column(scale=3):
-            prompt = gr.Textbox(
-                label="Instructions",
-                placeholder="Enter processing instructions here..."
-            )
-            steps = gr.Slider(
-                label="Crawl Steps",
-                minimum=1,
-                maximum=5,
-                value=1,
-                info="Number of levels to crawl for web content"
-            )
-        with gr.Column(scale=1):
-            report_check = gr.Checkbox(
-                label="Return Report",
-                value=True,
-                info="Generate detailed analysis report"
-            )
-            sum_mem_check = gr.Radio(
-                label="Output Type",
-                choices=["Summary", "Memory"],
-                value="Summary",
-                info="Choose between summarized or memory-based output"
-            )
-            process_btn = gr.Button("Process", variant="primary")
-
-    # Input Tabs
-    with gr.Tabs() as input_tabs:
-        with gr.Tab("📝 Text"):
-            text_input = gr.Textbox(
-                label="Input Text",
-                lines=6,
-                placeholder="Paste your text here..."
             )
-        with gr.Tab("📁 File"):
             file_input = gr.File(
                 label="Upload Files",
-                file_types=[".pdf", ".txt"],
                 file_count="multiple"
             )
-        with gr.Tab("🌐 Web URL"):
-            url_input = gr.Textbox(
-                label="Website URL",
-                placeholder="https://example.com"
-            )
-        with gr.Tab("📄 PDF URL"):
-            pdf_url_input = gr.Textbox(
-                label="PDF URL",
-                placeholder="https://example.com/document.pdf"
             )
-
-    # Output Section
-    with gr.Row():
-        with gr.Column():
-            json_output = gr.JSON(
-                label="Structured Output",
-                show_label=True
             )
-        with gr.Column():
-            error_output = gr.Textbox(
-                label="Status & Errors",
-                interactive=False
             )
-
-    # Event handlers
-    process_btn.click(
-        process_and_format_response,
-        inputs=[
-            prompt,
-            chatbot,
-            report_check,
-            sum_mem_check,
-            text_input,
-            file_input,
-            url_input,
-            pdf_url_input
-        ],
-        outputs=[
-            prompt,
-            chatbot,
-            error_output,
-            json_output
-        ]
-    )
-
-# Launch the app
-app.queue(default_concurrency_limit=20).launch(
-    show_api=False,
-    share=False,
-    server_name="0.0.0.0",
-    server_port=8000
-)
 import json
+import os
+import re
+import time
+import logging
+import mimetypes
+import zipfile
+import tempfile
+import chardet
+from datetime import datetime
+from typing import List, Dict, Optional, Union, Tuple
+from pathlib import Path
+from urllib.parse import urlparse, urljoin
+import requests
+import validators
+import gradio as gr
+from diskcache import Cache
+from bs4 import BeautifulSoup
+from fake_useragent import UserAgent
+from cleantext import clean
+import qrcode
+from PIL import Image, ImageDraw, ImageFont
+import numpy as np
+import tarfile
+import gzip
+import math
+
+# Setup enhanced logging with more detailed formatting
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s',
+    handlers=[
+        logging.StreamHandler(),
+        logging.FileHandler('app.log', encoding='utf-8')
+    ])
+logger = logging.getLogger(__name__)
+
+# Ensure output directories exist with modern structure
+OUTPUTS_DIR = Path('output')
+QR_CODES_DIR = OUTPUTS_DIR / 'qr_codes'
+TEMP_DIR = OUTPUTS_DIR / 'temp'
+for directory in [OUTPUTS_DIR, QR_CODES_DIR, TEMP_DIR]:
+    directory.mkdir(parents=True, exist_ok=True)
+
+class EnhancedURLProcessor:
+    """Advanced URL processing with complete content extraction"""
+    def __init__(self):
+        self.session = requests.Session()
+        self.timeout = 15  # Extended timeout for larger content
+        self.max_retries = 3
+        self.user_agent = UserAgent()
+
+        # Enhanced headers for better site compatibility
+        self.session.headers.update({
+            'User-Agent': self.user_agent.random,
+            'Accept': '*/*',  # Accept all content types
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'Sec-Fetch-Dest': 'document',
+            'Sec-Fetch-Mode': 'navigate',
+            'Sec-Fetch-Site': 'none',
+            'Sec-Fetch-User': '?1',
+            'DNT': '1'
+        })
+
+    def validate_url(self, url: str) -> Dict:
+        """Enhanced URL validation with detailed feedback"""
+        try:
+            if not validators.url(url):
+                return {'is_valid': False, 'message': 'Invalid URL format', 'details': 'URL must begin with http:// or https://'}
+            parsed = urlparse(url)
+            if not all([parsed.scheme, parsed.netloc]):
+                return {'is_valid': False, 'message': 'Incomplete URL', 'details': 'Missing scheme or domain'}
+            # Try HEAD request first to check accessibility
+            try:
+                head_response = self.session.head(url, timeout=5)
+                head_response.raise_for_status()
+            except requests.exceptions.RequestException:
+                # If HEAD fails, try GET as some servers don't support HEAD
+                response = self.session.get(url, timeout=self.timeout)
+                response.raise_for_status()
+                head_response = response  # Reuse the GET response for the header checks below (head_response would otherwise be unbound)
+
+            return {
+                'is_valid': True,
+                'message': 'URL is valid and accessible',
+                'details': {
+                    'content_type': head_response.headers.get('Content-Type', 'unknown'),
+                    'server': head_response.headers.get('Server', 'unknown'),
+                    'size': head_response.headers.get('Content-Length', 'unknown')
+                }
+            }
+        except Exception as e:
+            return {'is_valid': False, 'message': f'URL validation failed: {str(e)}', 'details': str(e)}
+
+    def fetch_content(self, url: str, retry_count: int = 0) -> Optional[Dict]:
+        """Enhanced content fetcher with retry mechanism and complete character extraction"""
+        try:
+            logger.info(f"Fetching content from URL: {url} (Attempt {retry_count + 1}/{self.max_retries})")
+
+            # Update User-Agent randomly for each request
+            self.session.headers.update({'User-Agent': self.user_agent.random})
+
+            response = self.session.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Detect encoding
+            if response.encoding is None:
+                encoding = chardet.detect(response.content)['encoding'] or 'utf-8'
+            else:
+                encoding = response.encoding
+            # Decode content with fallback
+            try:
+                raw_content = response.content.decode(encoding, errors='replace')
+            except (UnicodeDecodeError, LookupError):
+                raw_content = response.content.decode('utf-8', errors='replace')
+
+            # Extract metadata
+            metadata = {
+                'url': url,
+                'timestamp': datetime.now().isoformat(),
+                'encoding': encoding,
+                'content_type': response.headers.get('Content-Type', ''),
+                'content_length': len(response.content),
+                'headers': dict(response.headers),
+                'status_code': response.status_code
+            }
+
+            # Process based on content type
+            content_type = response.headers.get('Content-Type', '').lower()
+            if 'text/html' in content_type:
+                processed_content = self._process_html_content(raw_content, url)
+            else:
+                processed_content = raw_content
+            return {
+                'content': processed_content,
+                'raw_content': raw_content,
+                'metadata': metadata
+            }
+        except requests.exceptions.RequestException as e:
+            if retry_count < self.max_retries - 1:
+                logger.warning(f"Retry {retry_count + 1}/{self.max_retries} for URL: {url}")
+                time.sleep(2 ** retry_count)  # Exponential backoff
+                return self.fetch_content(url, retry_count + 1)
+            logger.error(f"Failed to fetch content after {self.max_retries} attempts: {e}")
+            return None
+        except Exception as e:
+            logger.error(f"Unexpected error while fetching content: {e}")
+            return None
+
+    def _process_html_content(self, content: str, base_url: str) -> str:
+        """Process HTML content while preserving all characters"""
+        try:
+            soup = BeautifulSoup(content, 'html.parser')
+
+            # Convert relative URLs to absolute
+            for tag in soup.find_all(['a', 'img', 'link', 'script']):
+                for attr in ['href', 'src']:
+                    if tag.get(attr):
+                        try:
+                            tag[attr] = urljoin(base_url, tag[attr])
+                        except Exception:
+                            pass
+            # Extract all text content
+            text_parts = []
+            for element in soup.stripped_strings:
+                text_parts.append(str(element))
+            return '\n'.join(text_parts)
+        except Exception as e:
+            logger.error(f"HTML processing error: {e}")
+            return content
+
+ class EnhancedFileProcessor:
175
+ """Advanced file processing with complete content extraction"""
176
+ def __init__(self, max_file_size: int = 5 * 1024 * 1024 * 1024): # 5GB default
177
+ self.max_file_size = max_file_size
178
+ self.supported_extensions = {
179
+ '.txt', '.md', '.csv', '.json', '.xml', '.html', '.htm',
180
+ '.log', '.yml', '.yaml', '.ini', '.conf', '.cfg',
181
+ '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar',
182
+ '.pdf', '.doc', '.docx', '.rtf', '.odt'
183
+ }
184
+
185
+ def process_file(self, file) -> List[Dict]:
186
+ """Process uploaded file with enhanced error handling and complete extraction"""
187
+ if not file:
188
+ return []
189
+
190
+ dataset = []
191
+ try:
192
+ file_size = os.path.getsize(file.name)
193
+ if file_size > self.max_file_size:
194
+ logger.warning(f"File size ({file_size} bytes) exceeds maximum allowed size")
195
+ return []
196
+
197
+ with tempfile.TemporaryDirectory() as temp_dir:
198
+ temp_dir_path = Path(temp_dir)
199
+
200
+ # Handle different archive types
201
+ if self._is_archive(file.name):
202
+ dataset.extend(self._process_archive(file.name, temp_dir_path))
203
+ elif Path(file.name).suffix.lower() in self.supported_extensions:
204
+ dataset.extend(self._process_single_file(file))
205
+ else:
206
+ logger.warning(f"Unsupported file type: {file.name}")
207
+
208
+ except Exception as e:
209
+ logger.error(f"Error processing file: {str(e)}")
210
+ return []
211
+ return dataset
212
+
213
+ def _is_archive(self, filepath: str) -> bool:
214
+ """Check if file is an archive"""
215
+ return any(filepath.lower().endswith(ext) for ext in [
216
+ '.zip', '.tar', '.gz', '.bz2', '.7z', '.rar'
217
+ ])
218
+
219
+ def _process_single_file(self, file) -> List[Dict]:
220
+ """Process a single file with enhanced character extraction and JSON handling"""
221
+ try:
222
+ file_stat = os.stat(file.name)
223
+ file_size = file_stat.st_size
224
+
225
+ # Initialize content storage
226
+ content_parts = []
227
+
228
+ # Process file in chunks for large files
229
+ chunk_size = 10 * 1024 * 1024 # 10MB chunks
230
+ with open(file.name, 'rb') as f:
231
+ while True:
232
+ chunk = f.read(chunk_size)
233
+ if not chunk:
234
+ break
235
+
236
+ # Detect encoding for each chunk
237
+ encoding = chardet.detect(chunk)['encoding'] or 'utf-8'
238
+ try:
239
+ decoded_chunk = chunk.decode(encoding, errors='replace')
240
+ content_parts.append(decoded_chunk)
241
+ except (UnicodeDecodeError, LookupError):
242
+ decoded_chunk = chunk.decode('utf-8', errors='replace')
243
+ content_parts.append(decoded_chunk)
244
+
245
+ # Combine all chunks
246
+ complete_content = ''.join(content_parts)
247
+
248
+ # Check if the content is valid JSON regardless of file extension
249
  try:
250
+ if mimetypes.guess_type(file.name)[0] == 'application/json' or file.name.lower().endswith('.json'):
251
+ # It's a JSON file by type or extension
252
+ json_data = json.loads(complete_content)
253
+ return [{
254
+ 'source': 'json_file',
255
+ 'filename': os.path.basename(file.name),
256
+ 'file_size': file_size,
257
+ 'mime_type': 'application/json',
258
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
259
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
260
+ 'content': json_data, # Store the parsed JSON object
261
+ 'raw_content': complete_content, # Store the original JSON string
262
+ 'timestamp': datetime.now().isoformat()
263
+ }]
264
+ else:
265
+ # Try to parse as JSON anyway
266
+ try:
267
+ json_data = json.loads(complete_content)
268
+ # If we get here, it's valid JSON despite the extension
269
+ return [{
270
+ 'source': 'json_content',
271
+ 'filename': os.path.basename(file.name),
272
+ 'file_size': file_size,
273
+ 'mime_type': 'application/json',
274
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
275
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
276
+ 'content': json_data, # Store the parsed JSON object
277
+ 'raw_content': complete_content, # Store the original JSON string
278
+ 'timestamp': datetime.now().isoformat()
279
+ }]
280
+ except json.JSONDecodeError:
281
+ logger.warning(f"File {file.name} is not valid JSON.")
282
  except Exception as e:
283
+ logger.error(f"Error during JSON processing: {e}")
284
+
285
+ return [{
286
+ 'source': 'file',
287
+ 'filename': os.path.basename(file.name),
288
+ 'file_size': file_size,
289
+ 'mime_type': mimetypes.guess_type(file.name)[0],
290
+ 'created': datetime.fromtimestamp(file_stat.st_ctime).isoformat(),
291
+ 'modified': datetime.fromtimestamp(file_stat.st_mtime).isoformat(),
292
+ 'content': complete_content,
293
+ 'timestamp': datetime.now().isoformat()
294
+ }]
295
+ except Exception as e:
296
+ logger.error(f"File processing error: {e}")
297
+ return []
298
+
299
+ def _process_archive(self, archive_path: str, extract_to: Path) -> List[Dict]:
300
+ """Process an archive file with enhanced extraction"""
301
+ dataset = []
302
+ try:
303
+ # Handle ZIP archives
304
+ if zipfile.is_zipfile(archive_path):
305
+ with zipfile.ZipFile(archive_path, 'r') as zip_ref:
306
+ zip_ref.extractall(extract_to)
307
+ for file_info in zip_ref.infolist():
308
+ if file_info.file_size > 0 and not file_info.filename.endswith('/'):
309
+ extracted_path = extract_to / file_info.filename
310
+ if extracted_path.suffix.lower() in self.supported_extensions:
311
+ with open(extracted_path, 'rb') as f:
312
+ dataset.extend(self._process_single_file(f))
313
+ # Handle TAR archives
314
+ elif archive_path.lower().endswith(('.tar', '.tar.gz', '.tgz')):
315
+ try:
316
+ with tarfile.open(archive_path, 'r:*') as tar_ref:
317
+ for member in tar_ref.getmembers():
318
+ if member.isfile():
319
+ extracted_path = extract_to / member.name
320
+ tar_ref.extract(member, path=extract_to)
321
+ if extracted_path.suffix.lower() in self.supported_extensions:
322
+ with open(extracted_path, 'rb') as f:
323
+ dataset.extend(self._process_single_file(f))
324
+ except tarfile.TarError as e:
325
+ logger.error(f"Error processing TAR archive: {e}")
326
+ # Handle GZIP archives (single file)
327
+ elif archive_path.lower().endswith('.gz'):
328
+ extracted_path = extract_to / Path(archive_path).stem
329
+ try:
330
+ with gzip.open(archive_path, 'rb') as gz_file, open(extracted_path, 'wb') as outfile:
331
+ outfile.write(gz_file.read())
332
+ if extracted_path.suffix.lower() in self.supported_extensions:
333
+ with open(extracted_path, 'rb') as f:
334
+ dataset.extend(self._process_single_file(f))
335
+ except gzip.GzipFile as e:
336
+ logger.error(f"Error processing GZIP archive: {e}")
337
+ # TODO: Add support for other archive types (.bz2, .7z, .rar) - may require external libraries
338
+ elif archive_path.lower().endswith(('.bz2', '.7z', '.rar')):
339
+ logger.warning(f"Support for {Path(archive_path).suffix} archives is not yet fully implemented.")
340
+
341
+ except Exception as e:
342
+ logger.error(f"Archive processing error: {e}")
343
+ return dataset
344
+
345
+ def chunk_data(self, data: Union[Dict, List], max_size: int = 2953) -> List[Dict]:
346
+ """Enhanced data chunking with sequence metadata"""
347
+ try:
348
+ # Convert data to JSON string
349
+ json_str = json.dumps(data, ensure_ascii=False)
350
+ total_length = len(json_str)
351
+
352
+ # Calculate overhead for metadata
353
+ metadata_template = {
354
+ "chunk_index": 0,
355
+ "total_chunks": 1,
356
+ "total_length": total_length,
357
+ "chunk_hash": "",
358
+ "data": ""
359
+ }
360
+ overhead = len(json.dumps(metadata_template)) + 20 # Extra padding for safety
361
+
362
+ # Calculate effective chunk size
363
+ effective_chunk_size = max_size - overhead
364
+
365
+ if total_length <= effective_chunk_size:
366
+ # Data fits in one chunk
367
+ chunk = {
368
+ "chunk_index": 0,
369
+ "total_chunks": 1,
370
+ "total_length": total_length,
371
+ "chunk_hash": hash(json_str) & 0xFFFFFFFF, # 32-bit hash
372
+ "data": json_str
373
+ }
374
+ return [chunk]
375
+
376
+ # Calculate number of chunks needed
377
+ num_chunks = -(-total_length // effective_chunk_size) # Ceiling division
378
+ chunk_size = -(-total_length // num_chunks) # Even distribution
379
+
380
+ chunks = []
381
+ for i in range(num_chunks):
382
+ start_idx = i * chunk_size
383
+ end_idx = min(start_idx + chunk_size, total_length)
384
+ chunk_data = json_str[start_idx:end_idx]
385
+
386
+ chunk = {
387
+ "chunk_index": i,
388
+ "total_chunks": num_chunks,
389
+ "total_length": total_length,
390
+ "chunk_hash": hash(chunk_data) & 0xFFFFFFFF,
391
+ "data": chunk_data
392
+ }
393
+ chunks.append(chunk)
394
+
395
+ return chunks
396
+ except Exception as e:
397
+ logger.error(f"Error chunking data: {e}")
398
+ return []
399
+
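The commit ships the chunker but no matching decoder. A sketch of the inverse operation under the metadata format above (an assumption, not part of the commit); note that chunk_hash comes from Python's built-in hash(), which is salted per process for strings, so it can only be verified inside the process that produced it:

import json
from typing import Dict, List

def reassemble_chunks(chunks: List[Dict]) -> object:
    """Invert EnhancedFileProcessor.chunk_data(): order by chunk_index and re-join."""
    ordered = sorted(chunks, key=lambda c: c["chunk_index"])
    expected = ordered[0]["total_chunks"]
    if len(ordered) != expected:
        raise ValueError(f"Expected {expected} chunks, got {len(ordered)}")
    json_str = "".join(c["data"] for c in ordered)
    if len(json_str) != ordered[0]["total_length"]:
        raise ValueError("Reassembled length does not match total_length")
    return json.loads(json_str)
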
+def generate_stylish_qr(data: Union[str, Dict],
+                        filename: str,
+                        size: int = 10,
+                        border: int = 4,
+                        fill_color: str = "#000000",
+                        back_color: str = "#FFFFFF") -> str:
+    """Generate a stylish QR code with enhanced visual appeal"""
+    try:
+        qr = qrcode.QRCode(
+            version=None,
+            error_correction=qrcode.constants.ERROR_CORRECT_H,
+            box_size=size,
+            border=border
         )
+
+        # Add data to QR code
+        if isinstance(data, dict):
+            qr.add_data(json.dumps(data, ensure_ascii=False))
+        else:
+            qr.add_data(data)
+
+        qr.make(fit=True)
+
+        # Create QR code image with custom colors
+        qr_image = qr.make_image(fill_color=fill_color, back_color=back_color)
+
+        # Convert to RGBA for transparency support
+        qr_image = qr_image.convert('RGBA')
+
+        # Add subtle gradient overlay
+        gradient = Image.new('RGBA', qr_image.size, (0, 0, 0, 0))
+        draw = ImageDraw.Draw(gradient)
+        for i in range(qr_image.width):
+            alpha = int(255 * (1 - i/qr_image.width) * 0.1)  # 10% maximum opacity
+            draw.line([(i, 0), (i, qr_image.height)], fill=(255, 255, 255, alpha))
+
+        # Combine images
+        final_image = Image.alpha_composite(qr_image, gradient)
+
+        # Save the image
+        output_path = QR_CODES_DIR / filename
+        final_image.save(output_path, quality=95)
+
+        return str(output_path)
+    except Exception as e:
+        logger.error(f"QR generation error: {e}")
+        return ""
+
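One interaction between the two functions above is worth flagging: chunk_data defaults to max_size=2953, the binary capacity of a version-40 QR code at error-correction level L, but generate_stylish_qr requests ERROR_CORRECT_H, whose version-40 capacity is only 1273 bytes, so a full-size chunk can overflow qr.make(fit=True). A hedged guard (not in the commit), using capacities from the QR specification:

# Version-40 binary-mode capacities per error-correction level (bytes).
QR_CAPACITY_BYTES = {"L": 2953, "M": 2331, "Q": 1663, "H": 1273}

def safe_chunks(processor: EnhancedFileProcessor, data, ec_level: str = "H"):
    """Chunk with a max_size matching the EC level actually used for encoding."""
    return processor.chunk_data(data, max_size=QR_CAPACITY_BYTES[ec_level])
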
+def generate_qr_codes(data: Union[str, Dict, List], combined: bool = True) -> List[str]:
+    """Generate QR codes with enhanced visual appeal and metadata"""
     try:
+        file_processor = EnhancedFileProcessor()
+        paths = []
+
+        if combined:
+            # Process combined data
+            chunks = file_processor.chunk_data(data)
+            for i, chunk in enumerate(chunks):
+                filename = f'combined_qr_{int(time.time())}_{i+1}_of_{len(chunks)}.png'
+                qr_path = generate_stylish_qr(
+                    data=chunk,
+                    filename=filename,
+                    fill_color="#1a365d",  # Deep blue
+                    back_color="#ffffff"
+                )
+                if qr_path:
+                    paths.append(qr_path)
         else:
+            # Process individual items
+            if isinstance(data, list):
+                for idx, item in enumerate(data):
+                    chunks = file_processor.chunk_data(item)
+                    for chunk_idx, chunk in enumerate(chunks):
+                        filename = f'item_{idx+1}_chunk_{chunk_idx+1}_of_{len(chunks)}_{int(time.time())}.png'
+                        qr_path = generate_stylish_qr(
+                            data=chunk,
+                            filename=filename,
+                            fill_color="#1a365d",  # Deep blue
+                            back_color="#ffffff"
+                        )
+                        if qr_path:
+                            paths.append(qr_path)
             else:
+                chunks = file_processor.chunk_data(data)
+                for i, chunk in enumerate(chunks):
+                    filename = f'single_qr_{i+1}_of_{len(chunks)}_{int(time.time())}.png'
+                    qr_path = generate_stylish_qr(
+                        data=chunk,
+                        filename=filename,
+                        fill_color="#1a365d",  # Deep blue
+                        back_color="#ffffff"
+                    )
+                    if qr_path:
+                        paths.append(qr_path)
+        return paths
+    except Exception as e:
+        logger.error(f"QR code generation error: {e}")
+        return []

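A quick end-to-end call of generate_qr_codes (not part of the commit); the payload and the printed filename are illustrative only:

payload = {"type": "note", "body": "hello " * 500}  # large enough to need more than one chunk
paths = generate_qr_codes(payload, combined=True)
for p in paths:
    print(p)  # e.g. output/qr_codes/combined_qr_<timestamp>_1_of_2.png
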
+def create_modern_interface():
+    """Create a modern and visually appealing Gradio interface"""
+
+    # Modern CSS styling
+    css = """
+    /* Modern color scheme */
+    :root {
+        --primary-color: #1a365d;
+        --secondary-color: #2d3748;
+        --accent-color: #4299e1;
+        --background-color: #f7fafc;
+        --success-color: #48bb78;
+        --error-color: #f56565;
+        --warning-color: #ed8936;
+    }
+    /* Container styling */
+    .container {
+        max-width: 1200px;
+        margin: auto;
+        padding: 2rem;
+        background-color: var(--background-color);
+        border-radius: 1rem;
+        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+    }
+    /* Component styling */
+    .input-container {
+        background-color: white;
+        padding: 1.5rem;
+        border-radius: 0.5rem;
+        border: 1px solid #e2e8f0;
+        margin-bottom: 1rem;
+    }
+    /* Button styling */
+    .primary-button {
+        background-color: var(--primary-color);
+        color: white;
+        padding: 0.75rem 1.5rem;
+        border-radius: 0.375rem;
+        border: none;
+        cursor: pointer;
+        transition: all 0.2s;
+    }
+    .primary-button:hover {
+        background-color: var(--accent-color);
+        transform: translateY(-1px);
+    }
+    /* Status messages */
+    .status {
+        padding: 1rem;
+        border-radius: 0.375rem;
+        margin: 1rem 0;
+    }
+    .status.success { background-color: #f0fff4; color: var(--success-color); }
+    .status.error { background-color: #fff5f5; color: var(--error-color); }
+    .status.warning { background-color: #fffaf0; color: var(--warning-color); }
+    /* Gallery styling */
+    .gallery {
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+        gap: 1rem;
+        padding: 1rem;
+        background-color: white;
+        border-radius: 0.5rem;
+        border: 1px solid #e2e8f0;
+    }
+    .gallery img {
+        width: 100%;
+        height: auto;
+        border-radius: 0.375rem;
+        transition: transform 0.2s;
+    }
+    .gallery img:hover {
+        transform: scale(1.05);
+    }
+    /* QR Code Viewport Styling */
+    .viewport-container {
+        display: grid;
+        gap: 0.5rem;
+        padding: 1rem;
+        background-color: white;
+        border-radius: 0.5rem;
+        border: 1px solid #e2e8f0;
+        margin-top: 1rem;
+    }
+    .viewport-item {
+        display: flex;
+        flex-direction: column;
+        align-items: center;
+    }
+    .viewport-item img {
+        width: 100%;
+        height: auto;
+        border-radius: 0.375rem;
+        transition: transform 0.2s;
+        max-width: 150px;  /* Adjust as needed */
+        max-height: 150px; /* Adjust as needed */
+    }
+    """
+    # Create interface with modern design
+    with gr.Blocks(css=css, title="Advanced Data Processor & QR Generator") as interface:
+        qr_code_paths = gr.State([])
+        gr.Markdown("""
+        # 🌐 Advanced Data Processing & QR Code Generator
+
+        Transform your data into beautifully designed, sequenced QR codes with our cutting-edge processor.
+        """)
+        with gr.Tab("📝 URL Processing"):
+            url_input = gr.Textbox(
+                label="Enter URLs (comma or newline separated)",
+                lines=5,
+                placeholder="https://example1.com\nhttps://example2.com",
+                value=""
             )
+        with gr.Tab("📁 File Input"):
             file_input = gr.File(
                 label="Upload Files",
+                file_types=["*"],  # Accept all file types
                 file_count="multiple"
             )
+        with gr.Tab("📋 JSON Input"):
+            text_input = gr.TextArea(
+                label="Direct JSON Input",
+                lines=15,
+                placeholder="Paste your JSON data here...",
+                value=""
             )
+        with gr.Row():
+            example_btn = gr.Button("📝 Load Example", variant="secondary")
+            clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+        with gr.Row():
+            combine_data = gr.Checkbox(
+                label="Combine all data into sequence",
+                value=True,
+                info="Generate sequential QR codes for combined data"
             )
+            process_btn = gr.Button(
+                "🔄 Process & Generate QR",
+                variant="primary"
             )
+        # Output components
+        output_json = gr.JSON(label="Processed Data")
+        output_gallery = gr.Gallery(
+            label="Generated QR Codes",
+            columns=3,
+            height=400,
+            show_label=True
+        )
+        output_text = gr.Textbox(
+            label="Processing Status",
+            interactive=False
+        )
+
+        with gr.Tab("🖼️ QR Code Viewport") as viewport_tab:
+            viewport_output = gr.HTML(label="QR Code Sequence Viewport")
+            enabled_qr_codes = gr.State([])  # To store the enabled/disabled state
+
+        # Load example data
+        def load_example():
+            example = {
+                "type": "product_catalog",
+                "items": [
+                    {
+                        "id": "123",
+                        "name": "Premium Widget",
+                        "description": "High-quality widget with advanced features",
+                        "price": 299.99,
+                        "category": "electronics",
+                        "tags": ["premium", "featured", "new"]
+                    },
+                    {
+                        "id": "456",
+                        "name": "Basic Widget",
+                        "description": "Reliable widget for everyday use",
+                        "price": 149.99,
+                        "category": "electronics",
+                        "tags": ["basic", "popular"]
+                    }
+                ],
+                "metadata": {
+                    "timestamp": datetime.now().isoformat(),
+                    "version": "2.0",
+                    "source": "example"
+                }
+            }
+            return json.dumps(example, indent=2)
+
+        def clear_input():
+            return ""
+
+        def update_viewport(paths, enabled_states):
+            if not paths:
+                return "<p>No QR codes generated yet.</p>"
+
+            num_qr_codes = len(paths)
+            cols = math.ceil(math.sqrt(num_qr_codes))
+            rows = math.ceil(num_qr_codes / cols)
+
+            viewport_html = '<div class="viewport-container" style="grid-template-columns: repeat({}, 1fr);">'.format(cols)
+
+            for i, path in enumerate(paths):
+                is_enabled = i in enabled_states
+                border = "border: 2px solid green;" if is_enabled else "border: 2px solid lightgray;"
+                viewport_html += f'<div class="viewport-item" id="qr_item_{i}">'
+                viewport_html += f'<img src="{path}" style="{border}" alt="QR Code {i+1}">'
+                viewport_html += f'<input type="checkbox" id="enable_qr_{i}" data-index="{i}" {"checked" if is_enabled else ""} onchange="updateEnabledStates(this)"> Enable'
+                viewport_html += '</div>'
+            viewport_html += '</div>'
+
+            return viewport_html
+
+        def process_inputs(urls, files, text, combine):
+            """Process all inputs and generate QR codes"""
+            try:
+                results = []
+                url_processor = EnhancedURLProcessor()
+                file_processor = EnhancedFileProcessor()
+
+                # Process JSON input
+                if text and text.strip():
+                    try:
+                        json_data = json.loads(text)
+                        if isinstance(json_data, list):
+                            results.extend(json_data)
+                        else:
+                            results.append(json_data)
+                    except json.JSONDecodeError as e:
+                        return None, [], f"❌ Invalid JSON format: {str(e)}"
+
+                # Process URLs
+                if urls and urls.strip():
+                    url_list = re.split(r'[,\n]', urls)
+                    url_list = [url.strip() for url in url_list if url.strip()]
+                    for url in url_list:
+                        validation = url_processor.validate_url(url)
+                        if validation['is_valid']:
+                            content = url_processor.fetch_content(url)
+                            if content:
+                                results.append({
+                                    'source': 'url',
+                                    'url': url,
+                                    'content': content,
+                                    'timestamp': datetime.now().isoformat()
+                                })
+
+                # Process files
+                if files:
+                    for file in files:
+                        file_results = file_processor.process_file(file)
+                        if file_results:
+                            results.extend(file_results)
+
+                # Generate QR codes
+                if results:
+                    qr_paths = generate_qr_codes(results, combine)
+                    if qr_paths:
+                        return (
+                            results,
+                            [str(path) for path in qr_paths],
+                            f"✅ Successfully processed {len(results)} items and generated {len(qr_paths)} QR codes!"
+                        )
+                    else:
+                        return None, [], "❌ Failed to generate QR codes"
+                else:
+                    return None, [], "⚠️ No valid content to process"
+            except Exception as e:
+                logger.error(f"Processing error: {e}")
+                return None, [], f"❌ Error: {str(e)}"
+
+        def on_qr_generation(results, qr_paths):
+            return qr_paths, qr_paths  # Update state with generated paths
+
+        process_btn.click(
+            process_inputs,
+            inputs=[url_input, file_input, text_input, combine_data],
+            outputs=[output_json, output_gallery, output_text]
+        ).then(on_qr_generation, inputs=[output_json, output_gallery], outputs=[qr_code_paths, viewport_output])
+
+        # Connect the example/clear buttons to the handlers defined above (otherwise unused)
+        example_btn.click(load_example, outputs=[text_input])
+        clear_btn.click(clear_input, outputs=[text_input])
+
+        viewport_tab.select(update_viewport, inputs=[qr_code_paths, enabled_qr_codes], outputs=[viewport_output])
+
+        # Add helpful documentation
+        gr.Markdown("""
+        ### 🚀 Features
+        - **Complete URL Scraping**: Extracts every character from web pages
+        - **Advanced File Processing**: Full content extraction from various text-based files and common archives. Supports flexible JSON handling.
+        - **Smart JSON Handling**: Processes any size JSON with automatic chunking, either via direct input or file upload.
+        - **Sequential QR Codes**: Maintains data integrity across multiple codes
+        - **QR Code Viewport**: Visualize generated QR codes in a sequenced square, with options to enable/disable individual codes.
+        - **Modern Design**: Clean, responsive interface with visual feedback
+
+        ### 💡 Tips
+        1. **URLs**: Enter multiple URLs separated by commas or newlines
+        2. **Files**: Upload any type of file. The processor will attempt to handle supported text-based files, archives (.zip, .tar, .gz), and JSON files.
+        3. **JSON**: Use the example button to see the expected format or upload a .json file. The system will also try to detect JSON content in other file types.
+        4. **QR Codes**: Choose whether to combine data into sequential codes
+        5. **Processing**: Monitor the status for real-time feedback
+
+        ### 🎨 Output
+        - Generated QR codes are saved in the `output/qr_codes` directory
+        - Each QR code contains metadata for proper sequencing
+        - Hover over QR codes in the gallery to see details
+        - The **QR Code Viewport** tab displays the generated QR codes in a grid.
+
+        ### ⚙️ QR Code Viewport Instructions
+        1. Navigate to the **QR Code Viewport** tab after generating QR codes.
+        2. The generated QR codes will be displayed in a square arrangement.
+        3. Use the checkboxes below each QR code to enable or disable it.
+        4. The visualization will update to reflect the enabled/disabled state (currently by a green border).
+        """)
+    return interface
+
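A caveat on update_viewport above: it writes local filesystem paths into <img src=...>, which browsers generally will not load from a gr.HTML component. A common workaround (an assumption, not in the commit) is to inline the PNGs as base64 data URIs:

import base64

def to_data_uri(path: str) -> str:
    """Inline a PNG so gr.HTML can display it without a file route."""
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:image/png;base64,{encoded}"
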
+def main():
+    """Initialize and launch the application"""
+    try:
+        # Configure system settings
+        mimetypes.init()
+
+        # Create and launch interface
+        interface = create_modern_interface()
+
+        # Launch with configuration
+        interface.launch(
+            share=False,
+            debug=False,
+            show_error=True,
+            show_api=False
+        )
+    except Exception as e:
+        logger.error(f"Application startup error: {e}")
+        raise
+
+if __name__ == "__main__":
+    main()
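To close the loop, a hypothetical reader for the generated sequence, assuming the third-party pyzbar package and the reassemble_chunks() sketch given earlier; glob order happens to work because the filenames embed their sequence position, but chunk_index is what actually orders the chunks:

import json
from pathlib import Path

from PIL import Image
from pyzbar.pyzbar import decode  # third-party; not imported by app2.py

def read_qr_sequence(qr_dir: str = "output/qr_codes") -> object:
    """Scan every QR PNG in qr_dir and rebuild the original payload."""
    chunks = []
    for png in sorted(Path(qr_dir).glob("*.png")):
        for symbol in decode(Image.open(png)):
            chunks.append(json.loads(symbol.data.decode("utf-8")))
    return reassemble_chunks(chunks)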