acecalisto3 commited on
Commit
f5f3613
·
verified ·
1 Parent(s): 8b33e0b

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +152 -255
app2.py CHANGED
@@ -555,7 +555,7 @@ class FileProcessor:
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
- img = qr.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
@@ -686,6 +686,7 @@ def datachat_interface(mode: str, data_source: str, json_input: str, qr_image: s
686
  else:
687
  return "Invalid mode selected."
688
 
 
689
  def create_interface():
690
  """Create a comprehensive Gradio interface with advanced features"""
691
  css = """
@@ -695,260 +696,22 @@ def create_interface():
695
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
696
  """
697
 
698
- with gr.Blocks(css=css, title="Advanced Data Processor & QR Code Generator") as interface:
699
- gr.Markdown("# 🌐 Advanced Data Processing & QR Code Generator")
700
-
701
- # URL Extraction Tab
702
- with gr.Tab("URL Extraction"):
703
- url_input = gr.Textbox(label="URL to Process", placeholder="https://example.com")
704
- depth_slider = gr.Slider(minimum=1, maximum=5, value=1, step=1, label="Crawl Depth (Higher values may affect performance)")
705
- respect_robots = gr.Checkbox(label="Respect robots.txt", value=True)
706
- extract_btn = gr.Button("Extract Content")
707
- url_output = gr.JSON(label="Extracted Data")
708
- download_btn = gr.Button("Download Results as ZIP")
709
- download_output = gr.File(label="Download")
710
-
711
- # Warning about depth
712
- gr.Markdown("""
713
- <div class="warning">
714
- ⚠️ <strong>Warning:</strong> Higher depth values (>2) may significantly increase processing time and resource usage.
715
- </div>
716
- """)
717
-
718
- # URL processor instance
719
- url_processor = URLProcessor()
720
-
721
- def process_url(url, depth, respect_robots):
722
- url_processor.respect_robots = respect_robots
723
- results = []
724
- try:
725
- # Validate URL
726
- validation = url_processor.validate_url(url)
727
- if not validation['is_valid']:
728
- return {"error": validation['message']}
729
-
730
- # Process with depth
731
- processed_urls = set()
732
- urls_to_process = [(url, 0)] # (url, current_depth)
733
-
734
- while urls_to_process:
735
- current_url, current_depth = urls_to_process.pop(0)
736
-
737
- if current_url in processed_urls:
738
- continue
739
-
740
- processed_urls.add(current_url)
741
- content = url_processor.fetch_content(current_url)
742
-
743
- if content:
744
- results.append({
745
- "url": current_url,
746
- "content": content.get('content', ''),
747
- "content_type": content.get('content_type', ''),
748
- "timestamp": datetime.now().isoformat()
749
- })
750
-
751
- # If we haven't reached max depth, extract and queue more URLs
752
- if current_depth < depth:
753
- soup = BeautifulSoup(content.get('content', ''), 'html.parser')
754
- for link in soup.find_all('a', href=True):
755
- next_url = link['href']
756
- if next_url.startswith('/'):
757
- # Convert relative URL to absolute
758
- from urllib.parse import urlparse, urljoin
759
- parsed_url = urlparse(current_url)
760
- base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
761
- next_url = urljoin(base_url, next_url)
762
-
763
- if validators.url(next_url) and next_url not in processed_urls:
764
- urls_to_process.append((next_url, current_depth + 1))
765
-
766
- return results
767
- except Exception as e:
768
- logger.error(f"URL processing error: {e}")
769
- return {"error": str(e)}
770
-
771
- def create_download_zip(results):
772
- if not results or (isinstance(results, dict) and 'error' in results):
773
- return None
774
-
775
- try:
776
- # Create a temporary zip file
777
- with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
778
- with zipfile.ZipFile(tmp.name, 'w') as zipf:
779
- # Add JSON data
780
- zipf.writestr('extracted_data.json', json.dumps(results, indent=2))
781
-
782
- # Add individual text files for each URL
783
- for idx, item in enumerate(results):
784
- if 'content' in item:
785
- zipf.writestr(f'content_{idx}_{int(time.time())}.txt', item['content'])
786
-
787
- return tmp.name
788
- except Exception as e:
789
- logger.error(f"Error creating ZIP file: {e}")
790
- return None
791
-
792
- extract_btn.click(process_url, [url_input, depth_slider, respect_robots], url_output)
793
- download_btn.click(create_download_zip, [url_output], download_output)
794
-
795
- # ZIP File Extractor Tab
796
- with gr.Tab("ZIP File Extractor"):
797
- zip_file_input = gr.File(label="Upload ZIP File")
798
- extract_zip_btn = gr.Button("Extract and Process")
799
- zip_output = gr.JSON(label="Extracted Data")
800
- zip_qr_btn = gr.Button("Generate QR Code")
801
- zip_qr_output = gr.Image(label="QR Code")
802
-
803
- file_processor = FileProcessor()
804
-
805
- def process_zip_file(file):
806
- if not file:
807
- return {"error": "No file uploaded"}
808
-
809
- try:
810
- results = file_processor.process_file(file)
811
- return results
812
- except Exception as e:
813
- logger.error(f"ZIP processing error: {e}")
814
- return {"error": str(e)}
815
-
816
- def generate_zip_qr(data):
817
- if not data or (isinstance(data, dict) and 'error' in data):
818
- return None
819
-
820
- try:
821
- return file_processor.generate_qr_code(data, combined=True)[0]
822
- except Exception as e:
823
- logger.error(f"QR generation error: {e}")
824
- return None
825
-
826
- extract_zip_btn.click(process_zip_file, [zip_file_input], zip_output)
827
- zip_qr_btn.click(generate_zip_qr, [zip_output], zip_qr_output)
828
-
829
- # Raw Text to JSON Tab
830
- with gr.Tab("Text to JSON"):
831
- text_input = gr.Textbox(lines=10, label="Raw Text Input")
832
- json_structure = gr.Dropdown(
833
- choices=["Simple", "Structured", "Key-Value Pairs"],
834
- label="JSON Structure",
835
- value="Simple"
836
- )
837
- convert_btn = gr.Button("Convert to JSON")
838
- json_output = gr.JSON(label="JSON Output")
839
- combine_json_btn = gr.Button("Combine with Previous JSON")
840
- previous_json = gr.Textbox(lines=5, label="Previous JSON (Optional)")
841
- combined_output = gr.JSON(label="Combined JSON")
842
- text_qr_btn = gr.Button("Generate QR Code")
843
- text_qr_output = gr.Image(label="QR Code")
844
-
845
- def convert_text_to_json(text, structure):
846
- if not text.strip():
847
- return {"error": "No text provided"}
848
-
849
- try:
850
- if structure == "Simple":
851
- return {
852
- "text": text,
853
- "timestamp": datetime.now().isoformat()
854
- }
855
- elif structure == "Structured":
856
- lines = text.split('\n')
857
- paragraphs = []
858
- current_para = []
859
-
860
- for line in lines:
861
- if line.strip():
862
- current_para.append(line)
863
- elif current_para:
864
- paragraphs.append(' '.join(current_para))
865
- current_para = []
866
-
867
- if current_para:
868
- paragraphs.append(' '.join(current_para))
869
-
870
- return {
871
- "title": paragraphs[0] if paragraphs else "",
872
- "paragraphs": paragraphs[1:] if len(paragraphs) > 1 else [],
873
- "timestamp": datetime.now().isoformat()
874
- }
875
- elif structure == "Key-Value Pairs":
876
- pairs = {}
877
- lines = text.split('\n')
878
-
879
- for line in lines:
880
- if ':' in line:
881
- key, value = line.split(':', 1)
882
- pairs[key.strip()] = value.strip()
883
-
884
- pairs["timestamp"] = datetime.now().isoformat()
885
- return pairs
886
-
887
- return {"error": "Invalid structure selected"}
888
- except Exception as e:
889
- logger.error(f"Text to JSON conversion error: {e}")
890
- return {"error": str(e)}
891
-
892
- def combine_json_data(current, previous):
893
- if not current or (isinstance(current, dict) and 'error' in current):
894
- return {"error": "No valid current JSON"}
895
-
896
- try:
897
- if not previous.strip():
898
- return current
899
-
900
- prev_json = json.loads(previous)
901
-
902
- # Determine how to combine based on types
903
- if isinstance(prev_json, list) and isinstance(current, list):
904
- return prev_json + current
905
- elif isinstance(prev_json, list):
906
- return prev_json + [current]
907
- elif isinstance(current, list):
908
- return [prev_json] + current
909
- else:
910
- # Both are objects, merge them
911
- combined = {**prev_json, **current}
912
- # Add a combined timestamp
913
- combined["combined_timestamp"] = datetime.now().isoformat()
914
- return combined
915
- except json.JSONDecodeError:
916
- return {"error": "Previous JSON is invalid"}
917
- except Exception as e:
918
- logger.error(f"JSON combination error: {e}")
919
- return {"error": str(e)}
920
-
921
- convert_btn.click(convert_text_to_json, [text_input, json_structure], json_output)
922
- combine_json_btn.click(combine_json_data, [json_output, previous_json], combined_output)
923
- text_qr_btn.click(generate_zip_qr, [json_output], text_qr_output)
924
-
925
- # DataChat Tab (existing)
926
- with gr.Tab("DataChat"):
927
- mode = gr.Radio(["Trained with Data", "Chat about Data"], label="Mode")
928
- data_source = gr.Radio(["JSON Input", "QR Code"], label="Data Source")
929
- json_input = gr.Textbox(lines=8, label="JSON Data")
930
- qr_image = gr.Image(label="QR Code Image", type="filepath")
931
- query = gr.Textbox(label="Query")
932
-
933
- submit_btn = gr.Button("Submit")
934
- output = gr.Textbox(label="Response")
935
-
936
- submit_btn.click(datachat_interface, [mode, data_source, json_input, qr_image, query], output)
937
-
938
- # QR Generator Tab (existing)
939
- with gr.Tab("QR Generator"):
940
- qr_input = gr.Textbox(lines=8, label="Input JSON for QR")
941
- generate_btn = gr.Button("Generate QR")
942
- qr_output = gr.Image(label="Generated QR Code")
943
-
944
- def generate_qr(json_data):
945
- data = file_processor.clean_json(json_data)
946
- if data:
947
- return file_processor.generate_qr_code(data)
948
- return None
949
-
950
- generate_btn.click(generate_qr, qr_input, qr_output)
951
-
952
  return interface
953
 
954
  def main():
@@ -966,3 +729,137 @@ def main():
966
 
967
  if __name__ == "__main__":
968
  main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  qr.add_data(json_str)
556
  qr.make(fit=True)
557
 
558
+ img = qr.make_image(fill_color="black", back_color="white")
559
  output_path = output_dir / f'combined_qr_{int(time.time())}.png'
560
  img.save(str(output_path))
561
  return [str(output_path)]
 
686
  else:
687
  return "Invalid mode selected."
688
 
689
+ # Simplified UI: a single gr.Interface wrapping datachat_interface (replaces the previous multi-tab gr.Blocks layout)
690
  def create_interface():
691
  """Create a comprehensive Gradio interface with advanced features"""
692
  css = """
 
696
  .success { background-color: #d4edda; color: #155724; padding: 10px; border-radius: 4px; }
697
  """
698
 
699
+ # Use Interface instead of Blocks
700
+ interface = gr.Interface(
701
+ fn=datachat_interface,
702
+ inputs=[
703
+ gr.Radio(["Trained with Data", "Chat about Data"], label="Mode"),
704
+ gr.Radio(["JSON Input", "QR Code"], label="Data Source"),
705
+ gr.Textbox(lines=8, label="JSON Data"),
706
+ gr.Image(label="QR Code Image", type="filepath"),
707
+ gr.Textbox(label="Query")
708
+ ],
709
+ outputs=gr.Textbox(label="Response"),
710
+ title="Advanced Data Processor & QR Code Generator",
711
+ description="# 🌐 Advanced Data Processing & QR Code Generator",
712
+ css=css
713
+ )
714
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  return interface
716
 
717
  def main():
 
729
 
730
  if __name__ == "__main__":
731
  main()
732
+
733
+
734
+ def create_download_zip(results):
735
+ if not results or (isinstance(results, dict) and 'error' in results):
736
+ return None
737
+
738
+ try:
739
+ # Create a temporary zip file
740
+ with tempfile.NamedTemporaryFile(suffix='.zip', delete=False) as tmp:
741
+ with zipfile.ZipFile(tmp.name, 'w') as zipf:
742
+ # Add JSON data
743
+ zipf.writestr('extracted_data.json', json.dumps(results, indent=2))
744
+
745
+ # Add individual files for each URL
746
+ for idx, item in enumerate(results):
747
+ if 'html' in item:
748
+ zipf.writestr(f'content_{idx}_full.html', item['html'])
749
+
750
+ if 'content' in item:
751
+ zipf.writestr(f'content_{idx}_text.txt', item['content'])
752
+
753
+ # Download and include images
754
+ if 'images' in item and item['images']:
755
+ img_dir = f'content_{idx}_images'
756
+ for img_idx, img in enumerate(item['images']):
757
+ try:
758
+ img_url = img['src']
759
+ if validators.url(img_url):
760
+ img_response = requests.get(img_url, timeout=10)
761
+ if img_response.status_code == 200:
762
+ # Extract file extension from URL or content type
763
+ content_type = img_response.headers.get('Content-Type', '')
764
+ ext = '.jpg' # Default extension
765
+ if 'png' in content_type:
766
+ ext = '.png'
767
+ elif 'gif' in content_type:
768
+ ext = '.gif'
769
+ elif 'svg' in content_type:
770
+ ext = '.svg'
771
+
772
+ zipf.writestr(f'{img_dir}/image_{img_idx}{ext}', img_response.content)
773
+ except Exception as e:
774
+ logger.error(f"Error downloading image {img_url}: {e}")
775
+
776
+ # Include scripts
777
+ if 'scripts' in item and item['scripts']:
778
+ scripts_dir = f'content_{idx}_scripts'
779
+ for script_idx, script in enumerate(item['scripts']):
780
+ if script:
781
+ zipf.writestr(f'{scripts_dir}/script_{script_idx}.js', script)
782
+
783
+ # Include styles
784
+ if 'styles' in item and item['styles']:
785
+ styles_dir = f'content_{idx}_styles'
786
+ for style_idx, style in enumerate(item['styles']):
787
+ if style:
788
+ zipf.writestr(f'{styles_dir}/style_{style_idx}.css', style)
789
+
790
+ # Include links as a separate file
791
+ if 'links' in item and item['links']:
792
+ links_content = "URL,Text\n"
793
+ for link in item['links']:
794
+ links_content += f"\"{link['url']}\",\"{link['text']}\"\n"
795
+ zipf.writestr(f'content_{idx}_links.csv', links_content)
796
+
797
+ # Include tables as CSV files
798
+ if 'tables' in item and item['tables']:
799
+ tables_dir = f'content_{idx}_tables'
800
+ for table_idx, table in enumerate(item['tables']):
801
+ table_content = ""
802
+ for row in table:
803
+ table_content += ",".join([f"\"{cell}\"" for cell in row]) + "\n"
804
+ zipf.writestr(f'{tables_dir}/table_{table_idx}.csv', table_content)
805
+
806
+ # Create an index.html file for easy navigation
807
+ index_html = """
808
+ <!DOCTYPE html>
809
+ <html>
810
+ <head>
811
+ <title>Extracted Content</title>
812
+ <style>
813
+ body { font-family: Arial, sans-serif; margin: 20px; }
814
+ h1 { color: #333; }
815
+ .url-item { margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; }
816
+ .url-title { font-weight: bold; }
817
+ .resource-list { margin-left: 20px; }
818
+ </style>
819
+ </head>
820
+ <body>
821
+ <h1>Extracted Content</h1>
822
+ """
823
+
824
+ for idx, item in enumerate(results):
825
+ index_html += f"""
826
+ <div class="url-item">
827
+ <div class="url-title">{idx + 1}. {item.get('url', 'Unknown URL')}</div>
828
+ <div>Title: {item.get('title', 'No title')}</div>
829
+ <div>Timestamp: {item.get('timestamp', '')}</div>
830
+ <div class="resource-list">
831
+ <p><a href="content_{idx}_full.html">Full HTML</a></p>
832
+ <p><a href="content_{idx}_text.txt">Text Content</a></p>
833
+ """
834
+
835
+ if 'images' in item and item['images']:
836
+ index_html += f"""
837
+ <p>Images: {len(item['images'])} found</p>
838
+ """
839
+
840
+ if 'links' in item and item['links']:
841
+ index_html += f"""
842
+ <p>Links: <a href="content_{idx}_links.csv">{len(item['links'])} found</a></p>
843
+ """
844
+
845
+ if 'tables' in item and item['tables']:
846
+ index_html += f"""
847
+ <p>Tables: {len(item['tables'])} found</p>
848
+ """
849
+
850
+ index_html += """
851
+ </div>
852
+ </div>
853
+ """
854
+
855
+ index_html += """
856
+ </body>
857
+ </html>
858
+ """
859
+
860
+ zipf.writestr('index.html', index_html)
861
+
862
+ return tmp.name
863
+ except Exception as e:
864
+ logger.error(f"Error creating ZIP file: {e}")
865
+ return None