Spaces:

acecalisto3
/

urld

Running

App Files Files Community

acecalisto3 committed on Mar 27

Commit

8433748

verified ·

1 Parent(s): e784699

Update app2.py

Browse files

Files changed (1) hide show

app2.py +680 -680

app2.py CHANGED Viewed

@@ -1,710 +1,710 @@
-"""
-Advanced URL & Text Processing Suite - Professional Edition
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Enterprise-grade application with advanced features, real-time analytics,
-parallel processing, and sophisticated UI components.
-"""
 import gradio as gr
-import logging
-import json
 import os
-import sys
-import threading
-import queue
-import time
-from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
-from datetime import datetime
-from pathlib import Path
-from typing import Dict, List, Optional, Union, Any, Tuple
-from dataclasses import dataclass, asdict
-import numpy as np
-import pandas as pd
-# Configure advanced logging with rotation
-from logging.handlers import RotatingFileHandler
-log_handler = RotatingFileHandler(
-    'urld_pro.log',
-    maxBytes=10*1024*1024,  # 10MB
-    backupCount=5
-)
-log_handler.setFormatter(
-    logging.Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] %(name)s - %(message)s')
 )
-logger = logging.getLogger(__name__)
-logger.addHandler(log_handler)
-logger.setLevel(logging.INFO)
-# Advanced Theme Configuration
-THEME = gr.themes.Soft(
-    primary_hue=gr.themes.colors.indigo,
-    secondary_hue=gr.themes.colors.blue,
-    neutral_hue=gr.themes.colors.slate,
-    spacing_size=gr.themes.sizes.spacing_lg,
-    radius_size=gr.themes.sizes.radius_lg,
-    text_size=gr.themes.sizes.text_lg,
-).set(
-    body_background_fill="*background_fill_secondary",
-    button_primary_background_fill="*primary_500",
-    button_primary_background_fill_hover="*primary_600",
-    button_primary_text_color="white",
-    button_primary_border_color="*primary_500",
-    button_secondary_background_fill="*secondary_500",
-    button_secondary_background_fill_hover="*secondary_600",
-    button_secondary_text_color="white",
-    button_secondary_border_color="*secondary_500",
 )
-# Enhanced CSS with advanced animations and modern design
-CUSTOM_CSS = """
-@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
-:root {
-    --primary-color: #4f46e5;
-    --secondary-color: #2563eb;
-    --success-color: #059669;
-    --error-color: #dc2626;
-    --warning-color: #d97706;
-    --info-color: #2563eb;
-}
-body {
-    font-family: 'Inter', sans-serif;
-}
-.container {
-    max-width: 1400px !important;
-    margin: auto !important;
-    padding: 2rem !important;
-}
-.pro-header {
-    text-align: center;
-    margin-bottom: 2rem;
-    padding: 2rem;
-    background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
-    border-radius: 1rem;
-    color: white;
-    box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1), 0 2px 4px -2px rgb(0 0 0 / 0.1);
-}
-.pro-header h1 {
-    font-size: 2.5rem;
-    font-weight: 700;
-    margin-bottom: 1rem;
-}
-.pro-header p {
-    font-size: 1.25rem;
-    opacity: 0.9;
-}
-.dashboard {
-    display: grid;
-    grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
-    gap: 1rem;
-    margin-bottom: 2rem;
-}
-.stat-card {
-    background: white;
-    padding: 1.5rem;
-    border-radius: 1rem;
-    box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1), 0 1px 2px -1px rgb(0 0 0 / 0.1);
-    transition: transform 0.2s ease;
-}
-.stat-card:hover {
-    transform: translateY(-2px);
-}
-.chart-container {
-    background: white;
-    padding: 1rem;
-    border-radius: 1rem;
-    box-shadow: 0 1px 3px 0 rgb(0 0 0 / 0.1);
-    margin-bottom: 1rem;
-}
-.pro-tab {
-    transition: all 0.3s ease;
-}
-.pro-tab:hover {
-    transform: translateY(-1px);
-}
-.pro-button {
-    transition: all 0.2s ease;
-}
-.pro-button:hover {
-    transform: translateY(-1px);
-    box-shadow: 0 4px 6px -1px rgb(0 0 0 / 0.1);
-}
-.pro-button:active {
-    transform: translateY(0);
-}
-.status-message {
-    padding: 1rem;
-    border-radius: 0.5rem;
-    margin: 1rem 0;
-    animation: slideIn 0.3s ease;
-}
-.status-message.success {
-    background: #ecfdf5;
-    border: 1px solid var(--success-color);
-    color: var(--success-color);
-}
-.status-message.error {
-    background: #fef2f2;
-    border: 1px solid var(--error-color);
-    color: var(--error-color);
-}
-.status-message.warning {
-    background: #fffbeb;
-    border: 1px solid var(--warning-color);
-    color: var(--warning-color);
-}
-@keyframes slideIn {
-    from {
-        opacity: 0;
-        transform: translateY(-10px);
-    }
-    to {
-        opacity: 1;
-        transform: translateY(0);
-    }
-}
-.loading {
-    position: relative;
-}
-.loading::after {
-    content: '';
-    position: absolute;
-    top: 0;
-    left: 0;
-    width: 100%;
-    height: 100%;
-    background: linear-gradient(
-        90deg,
-        rgba(255,255,255,0) 0%,
-        rgba(255,255,255,0.2) 50%,
-        rgba(255,255,255,0) 100%
-    );
-    animation: shimmer 1.5s infinite;
-}
-@keyframes shimmer {
-    0% {
-        transform: translateX(-100%);
-    }
-    100% {
-        transform: translateX(100%);
-    }
-}
-.pro-footer {
-    text-align: center;
-    margin-top: 3rem;
-    padding: 2rem;
-    background: #f8fafc;
-    border-radius: 1rem;
-    box-shadow: 0 -1px 3px 0 rgb(0 0 0 / 0.1);
-}
-"""
-@dataclass
-class ProcessingStats:
-    """Data class for tracking processing statistics"""
-    total_urls: int = 0
-    successful_urls: int = 0
-    failed_urls: int = 0
-    total_files: int = 0
-    successful_files: int = 0
-    failed_files: int = 0
-    total_qr_codes: int = 0
-    successful_qr_codes: int = 0
-    failed_qr_codes: int = 0
-    processing_time: float = 0.0
-    last_updated: str = datetime.now().isoformat()
-class AdvancedProInterface:
-    """Professional interface with advanced features and real-time analytics"""
-    def __init__(self):
-        """Initialize the professional interface with advanced components"""
-        self.url_processor = URLProcessor()
-        self.file_processor = FileProcessor()
-        self.qr_processor = QRProcessor()
-        self.stats = ProcessingStats()
-        self.processing_queue = queue.Queue()
-        self.thread_pool = ThreadPoolExecutor(max_workers=10)
-        self.process_pool = ProcessPoolExecutor(max_workers=4)
-        self.processing_history: List[Dict] = []
-        # Initialize real-time monitoring
-        self._start_monitoring()
-    def _start_monitoring(self):
-        """Start background monitoring thread"""
-        def monitor():
-            while True:
-                try:
-                    # Update statistics
-                    self.stats.last_updated = datetime.now().isoformat()
-                    # Process queued items
-                    while not self.processing_queue.empty():
-                        item = self.processing_queue.get_nowait()
-                        self._process_queued_item(item)
-                    time.sleep(1)
-                except Exception as e:
-                    logger.error(f"Monitoring error: {e}")
-        threading.Thread(target=monitor, daemon=True).start()
-    def _process_queued_item(self, item: Dict):
-        """Process queued items with advanced error handling"""
         try:
-            item_type = item.get('type')
-            if item_type == 'url':
-                self._process_url_item(item)
-            elif item_type == 'file':
-                self._process_file_item(item)
-            elif item_type == 'qr':
-                self._process_qr_item(item)
         except Exception as e:
-            logger.error(f"Queue processing error: {e}")
-    def _process_url_item(self, item: Dict):
-        """Process URL items with advanced features"""
-        try:
-            start_time = time.time()
-            results = self.url_processor.process_urls([item['url']], mode=item['mode'])
-            processing_time = time.time() - start_time
-            self.stats.total_urls += 1
-            if any('error' in r for r in results):
-                self.stats.failed_urls += 1
             else:
-                self.stats.successful_urls += 1
-            self.stats.processing_time += processing_time
-            # Update history
-            self.processing_history.append({
-                'timestamp': datetime.now().isoformat(),
-                'type': 'url',
-                'url': item['url'],
-                'results': results,
-                'processing_time': processing_time
-            })
-        except Exception as e:
-            logger.error(f"URL processing error: {e}")
-            self.stats.failed_urls += 1
-    async def process_urls_parallel(self, urls: str, mode: str) -> Tuple[str, str, str, Dict]:
-        """Process URLs in parallel with advanced features"""
         try:
-            url_list = [url.strip() for url in urls.split('\n') if url.strip()]
-            if not url_list:
-                return "", "⚠️ No valid URLs provided", "", {}
-            start_time = time.time()
-            # Process URLs in parallel
-            futures = []
-            for url in url_list:
-                future = self.thread_pool.submit(
-                    self.url_processor.process_urls,
-                    [url],
-                    mode=mode
                 )
-                futures.append(future)
-            # Collect results
-            results = []
-            for future in futures:
-                try:
-                    result = future.result(timeout=30)
-                    results.extend(result)
-                except Exception as e:
-                    logger.error(f"URL processing error: {e}")
-                    results.append({
-                        'error': str(e),
-                        'timestamp': datetime.now().isoformat()
-                    })
-            processing_time = time.time() - start_time
-            # Update statistics
-            self.stats.total_urls += len(url_list)
-            self.stats.successful_urls += len([r for r in results if 'error' not in r])
-            self.stats.failed_urls += len([r for r in results if 'error' in r])
-            self.stats.processing_time += processing_time
-            # Generate analytics
-            analytics = self._generate_analytics(results, processing_time)
-            # Format output
-            formatted_results = json.dumps(results, indent=2)
-            summary = self._generate_summary(results)
-            return (
-                formatted_results,
-                f"✅ Processed {len(url_list)} URLs in {processing_time:.2f}s",
-                summary,
-                analytics
             )
-        except Exception as e:
-            logger.error(f"Parallel processing error: {e}")
-            return "", f"❌ Error: {str(e)}", "", {}
-    def _generate_analytics(self, results: List[Dict], processing_time: float) -> Dict:
-        """Generate detailed analytics from processing results"""
-        analytics = {
-            'processing_time': processing_time,
-            'total_items': len(results),
-            'success_rate': len([r for r in results if 'error' not in r]) / len(results) if results else 0,
-            'error_rate': len([r for r in results if 'error' in r]) / len(results) if results else 0,
-            'average_time_per_item': processing_time / len(results) if results else 0,
-            'timestamp': datetime.now().isoformat()
-        }
-        # Add historical trends
-        if self.processing_history:
-            historical_success_rates = [
-                len([r for r in h['results'] if 'error' not in r]) / len(h['results'])
-                for h in self.processing_history[-10:]  # Last 10 operations
-            ]
-            analytics['historical_success_rate'] = sum(historical_success_rates) / len(historical_success_rates)
-        return analytics
-    def create_interface(self) -> gr.Blocks:
-        """Create an advanced professional interface with real-time analytics"""
-        with gr.Blocks(theme=THEME, css=CUSTOM_CSS) as interface:
-            # Professional Header
-            gr.Markdown(
-                """
-                <div class="pro-header">
-                    <h1>🌐 Advanced URL & Text Processing Suite - Pro</h1>
-                    <p>Enterprise-grade toolkit with advanced features and real-time analytics</p>
-                </div>
-                """
             )
-            # Real-time Dashboard
-            with gr.Row(elem_classes="dashboard"):
-                with gr.Column(elem_classes="stat-card"):
-                    url_stats = gr.JSON(
-                        value={
-                            "Total URLs": 0,
-                            "Success Rate": "0%",
-                            "Avg. Processing Time": "0ms"
-                        },
-                        label="URL Processing Stats"
-                    )
-                with gr.Column(elem_classes="stat-card"):
-                    file_stats = gr.JSON(
-                        value={
-                            "Total Files": 0,
-                            "Success Rate": "0%",
-                            "Avg. Processing Time": "0ms"
-                        },
-                        label="File Processing Stats"
-                    )
-                with gr.Column(elem_classes="stat-card"):
-                    qr_stats = gr.JSON(
-                        value={
-                            "Total QR Codes": 0,
-                            "Success Rate": "0%",
-                            "Avg. Processing Time": "0ms"
-                        },
-                        label="QR Code Stats"
-                    )
-            # Main Interface Tabs
-            with gr.Tabs() as tabs:
-                # Advanced URL Processing Tab
-                with gr.Tab("🔗 URL Processing", elem_classes="pro-tab"):
-                    with gr.Row():
-                        with gr.Column(scale=2):
-                            url_input = gr.Textbox(
-                                label="URLs",
-                                placeholder="Enter URLs (one per line)",
-                                lines=5
-                            )
-                            with gr.Row():
-                                mode = gr.Radio(
-                                    choices=["basic", "interactive", "deep"],
-                                    value="basic",
-                                    label="Processing Mode"
-                                )
-                                parallel = gr.Checkbox(
-                                    label="Enable Parallel Processing",
-                                    value=True
-                                )
-                            with gr.Row():
-                                process_btn = gr.Button(
-                                    "🚀 Process URLs",
-                                    elem_classes="pro-button"
-                                )
-                                clear_btn = gr.Button(
-                                    "🗑️ Clear",
-                                    elem_classes="pro-button"
-                                )
-                        with gr.Column(scale=1):
-                            gr.Markdown(
-                                """
-                                ### 📊 Processing Modes
-                                #### Basic Mode
-                                - Fast content retrieval
-                                - Basic metadata extraction
-                                - Suitable for simple URLs
-                                #### Interactive Mode
-                                - Handles JavaScript-rendered content
-                                - Social media support
-                                - Advanced rate limiting
-                                #### Deep Mode
-                                - Full content analysis
-                                - Link following
-                                - Comprehensive metadata
-                                """
-                            )
-                    with gr.Row():
-                        status_output = gr.Textbox(
-                            label="Status",
-                            interactive=False
-                        )
-                    with gr.Tabs():
-                        with gr.Tab("Results"):
-                            json_output = gr.JSON(
-                                label="Detailed Results"
-                            )
-                        with gr.Tab("Summary"):
-                            summary_output = gr.Textbox(
-                                label="Processing Summary",
-                                interactive=False,
-                                lines=10
-                            )
-                        with gr.Tab("Analytics"):
-                            analytics_output = gr.Plot(
-                                label="Processing Analytics"
-                            )
-                # Advanced File Processing Tab
-                with gr.Tab("📁 File Processing", elem_classes="pro-tab"):
-                    with gr.Row():
-                        with gr.Column(scale=2):
-                            file_input = gr.File(
-                                label="Upload Files",
-                                file_types=[
-                                    ".txt", ".pdf", ".doc", ".docx",
-                                    ".zip", ".tar.gz", ".jpg", ".png"
-                                ],
-                                multiple=True
-                            )
-                            with gr.Row():
-                                process_file_btn = gr.Button(
-                                    "📥 Process Files",
-                                    elem_classes="pro-button"
-                                )
-                                batch_size = gr.Slider(
-                                    minimum=1,
-                                    maximum=10,
-                                    value=3,
-                                    step=1,
-                                    label="Batch Size"
-                                )
-                        with gr.Column(scale=1):
-                            gr.Markdown(
-                                """
-                                ### 📑 Supported Formats
-                                #### Documents
-                                - PDF files (.pdf)
-                                - Word documents (.doc, .docx)
-                                - Text files (.txt)
-                                #### Archives
-                                - ZIP archives (.zip)
-                                - TAR archives (.tar.gz)
-                                #### Media
-                                - Images (.jpg, .png)
-                                - And more...
-                                """
-                            )
-                    with gr.Row():
-                        file_status = gr.Textbox(
-                            label="Status",
-                            interactive=False
-                        )
-                    with gr.Tabs():
-                        with gr.Tab("Results"):
-                            file_output = gr.File(
-                                label="Processed Files"
-                            )
-                        with gr.Tab("Details"):
-                            file_json_output = gr.JSON(
-                                label="Processing Details"
-                            )
-                        with gr.Tab("Analytics"):
-                            file_analytics = gr.Plot(
-                                label="File Processing Analytics"
-                            )
-                # Advanced QR Code Tab
-                with gr.Tab("📱 QR Code", elem_classes="pro-tab"):
-                    with gr.Row():
-                        with gr.Column(scale=2):
-                            qr_input = gr.Textbox(
-                                label="Data",
-                                placeholder="Enter data to encode",
-                                lines=3
-                            )
-                            with gr.Row():
-                                qr_size = gr.Slider(
-                                    minimum=5,
-                                    maximum=50,
-                                    value=10,
-                                    step=1,
-                                    label="QR Code Size"
-                                )
-                                qr_correction = gr.Dropdown(
-                                    choices=["L", "M", "Q", "H"],
-                                    value="M",
-                                    label="Error Correction"
-                                )
-                            with gr.Row():
-                                generate_qr_btn = gr.Button(
-                                    "✨ Generate QR",
-                                    elem_classes="pro-button"
-                                )
-                                customize_btn = gr.Button(
-                                    "🎨 Customize",
-                                    elem_classes="pro-button"
-                                )
-                        with gr.Column(scale=1):
-                            qr_output = gr.Image(
-                                label="Generated QR Code"
-                            )
-                            qr_status = gr.Textbox(
-                                label="Status",
-                                interactive=False
-                            )
-            # Professional Footer
-            gr.Markdown(
-                """
-                <div class="pro-footer">
-                    <p>Advanced URL & Text Processing Suite - Professional Edition</p>
-                    <p style="font-size: 0.9rem;">Version 1.0.0 Pro | © 2024 Advanced URL Processing Team</p>
-                </div>
-                """
             )
-            # Event Handlers
-            process_btn.click(
-                fn=self.process_urls_parallel,
-                inputs=[url_input, mode],
-                outputs=[
-                    json_output,
-                    status_output,
-                    summary_output,
-                    analytics_output
-                ]
             )
-            clear_btn.click(
-                fn=lambda: ("", "", "", None),
-                inputs=[],
-                outputs=[
-                    url_input,
-                    status_output,
-                    summary_output,
-                    analytics_output
-                ]
             )
-            process_file_btn.click(
-                fn=self.process_file,
-                inputs=[file_input],
-                outputs=[
-                    file_json_output,
-                    file_status,
-                    file_output
-                ]
             )
-            generate_qr_btn.click(
-                fn=self.generate_qr,
-                inputs=[qr_input, qr_size],
-                outputs=[qr_output, qr_status]
             )
-            # Update statistics periodically
-            gr.Markdown.update(every=5)
-        return interface
-def main():
-    """Main entry point with advanced error handling"""
-    try:
-        # Initialize interface
-        app = AdvancedProInterface()
-        interface = app.create_interface()
-        # Launch with professional configuration
-        interface.launch(
-            server_name="0.0.0.0",
-            server_port=8000,
-            share=False,
-            debug=True,
-            enable_queue=True,
-            max_threads=40,
-            auth=None,  # Add authentication if needed
-            ssl_keyfile=None,  # Add SSL if needed
-            ssl_certfile=None
-        )
-    except Exception as e:
-        logger.error(f"Application startup error: {e}", exc_info=True)
-        sys.exit(1)
-if __name__ == "__main__":
-    main()

 import gradio as gr
+#import urllib.request
+import requests
+import zipfile
+import uuid
+import bs4
+import lxml
 import os
+#import subprocess
+from huggingface_hub import InferenceClient,HfApi
+import random
+import json
+import datetime
+from pypdf import PdfReader
+import uuid
+#from query import tasks
+from agent import (
+    PREFIX,
+    COMPRESS_DATA_PROMPT,
+    COMPRESS_DATA_PROMPT_SMALL,
+    LOG_PROMPT,
+    LOG_RESPONSE,
 )
+# Shared text-generation client; run_gpt streams completions through it.
+client = InferenceClient(
+    "mistralai/Mixtral-8x7B-Instruct-v0.1"
 )
+# Dataset repo name and the raw-file base URL built from it.
+reponame="acecalisto3/tmp"
+save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
+# Indexing os.environ raises KeyError at import time when HF_TOKEN is unset.
+token_self = os.environ['HF_TOKEN']
+api=HfApi(token=token_self)
+def find_all(purpose, task, history, url, result, steps=1):
+    """Breadth-first crawl starting at ``url``, collecting raw page text.
+
+    Pages are fetched with ``requests`` and parsed with BeautifulSoup (lxml).
+    Only links whose ``href`` starts with ``http`` are followed.
+
+    Args:
+        purpose, task, history, result: Accepted for call compatibility; the
+            crawl itself does not read them.
+        url: URL where the crawl starts (depth 0).
+        steps: Number of link levels to fetch. A page at depth ``d`` is only
+            requested while ``d < steps``, so the default of 1 fetches the
+            start page alone and 0 fetches nothing. The default lets
+            five-argument callers work instead of raising TypeError.
+
+    Returns:
+        ``(True, pages)`` where ``pages`` holds one
+        ``'RAW TEXT RETURNED: ...'`` string per page that answered HTTP 200.
+        Fetch and parse failures are printed and skipped, never raised.
+    """
+    # Local import keeps this block self-contained; deque gives O(1) pops.
+    from collections import deque
+
+    return_list = []
+    visited_links = set()
+    links_to_visit = deque([(url, 0)])
+    while links_to_visit:
+        current_url, current_depth = links_to_visit.popleft()
+        # Drop anything past the depth limit or already fetched.
+        if current_depth >= steps or current_url in visited_links:
+            continue
+        visited_links.add(current_url)
+        try:
+            # A timeout stops one unresponsive host from stalling the crawl.
+            source = requests.get(current_url, timeout=30)
+            if source.status_code != 200:
+                continue
+            soup = bs4.BeautifulSoup(source.content, 'lxml')
+            return_list.append(f'RAW TEXT RETURNED: {soup.text}')
+            next_depth = current_depth + 1
+            if next_depth >= steps:
+                # Children would be discarded on pop anyway; skip queueing.
+                continue
+            for link in soup.find_all("a"):
+                href = link.get('href')
+                if href and href.startswith('http') and href not in visited_links:
+                    links_to_visit.append((href, next_depth))
+        except Exception as e:
+            print(f"Error fetching {current_url}: {e}")
+    return True, return_list
+def read_txt(txt_path):
+    """Return the entire contents of the text file at ``txt_path``.
+
+    The file is decoded as UTF-8 regardless of the platform locale. The text
+    is also echoed to stdout.
+    """
+    # The ``with`` block closes the handle; no explicit close() is needed.
+    with open(txt_path, "r", encoding="utf-8") as f:
+        text = f.read()
+    print (text)
+    return text
+def read_pdf(pdf_path):
+    """Extract the text of every page of the PDF at ``pdf_path``.
+
+    Each page's extracted text is preceded by a newline. The combined text
+    is printed to stdout and returned.
+    """
+    pdf = PdfReader(f'{pdf_path}')
+    combined = "".join(f'\n{page.extract_text()}' for page in pdf.pages)
+    print (combined)
+    return combined
+error_box=[]
+def read_pdf_online(url):
+    """Download the PDF at ``url`` and return its extracted text.
+
+    Returns:
+        On HTTP 200, the text of all pages (each page preceded by a newline).
+        On any other status, the integer status code; the URL is also
+        appended to the module-level ``error_box``.
+        If the download or the parsing raises, the exception object itself.
+        The non-string returns are kept so existing callers see no change.
+    """
+    # Local import keeps this block self-contained.
+    import io
+
+    print(f"reading {url}")
+    text=""
+    try:
+        # The request sits inside the try so connection errors take the same
+        # "return the exception" path as parse errors; the timeout keeps an
+        # unresponsive server from blocking forever.
+        response = requests.get(url, timeout=60)
+        print(response.status_code)
+        if response.status_code != 200:
+            text = response.status_code
+            error_box.append(url)
+            print(text)
+            return text
+        # Parse from memory instead of a shared "test.pdf" on disk, so
+        # concurrent calls cannot overwrite each other's download and no
+        # stray file is left behind.
+        reader = PdfReader(io.BytesIO(response.content))
+        print(len(reader.pages))
+        for page in reader.pages:
+            text = f'{text}\n{page.extract_text()}'
+            print(f"PDF_TEXT:: {text}")
+        return text
+    except Exception as e:
+        print (e)
+        return e
+# Gates the prompt/response logging in run_gpt.
+VERBOSE = True
+# Not referenced in the code visible here — TODO confirm where it is used.
+MAX_HISTORY = 100
+# Divisor used by compress_data and compress_data_og to size history chunks.
+MAX_DATA = 20000
+def format_prompt(message, history):
+    """Render prior turns plus ``message`` in ``<s>[INST] ... [/INST]`` layout.
+
+    ``history`` is an iterable of ``(user_prompt, bot_response)`` pairs. Each
+    pair becomes ``[INST] user [/INST] bot</s> ``, and the new message is
+    appended as a final open ``[INST] ... [/INST]`` turn.
+    """
+    turns = "".join(
+        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
+        for user_prompt, bot_response in history
+    )
+    return f"<s>{turns}[INST] {message} [/INST]"
+def run_gpt(
+    prompt_template,
+    stop_tokens,
+    max_tokens,
+    seed,
+    **prompt_kwargs,
+):
+    """Fill ``prompt_template`` and return the text generated for it.
+
+    The prompt is ``PREFIX`` (formatted with the current timestamp and a
+    fixed purpose string) followed by ``prompt_template`` formatted with
+    ``prompt_kwargs``. Tokens are streamed from the module-level ``client``
+    and concatenated. When ``VERBOSE`` is set, prompt and response are
+    printed through ``LOG_PROMPT`` / ``LOG_RESPONSE``.
+
+    NOTE(review): ``stop_tokens`` is accepted but never forwarded to the
+    client — confirm whether that is intended.
+    """
+    print(seed)
+    now = datetime.datetime.now()
+    sampling = {
+        "temperature": 0.9,
+        "max_new_tokens": max_tokens,
+        "top_p": 0.95,
+        "repetition_penalty": 1.0,
+        "do_sample": True,
+        "seed": seed,
+    }
+    header = PREFIX.format(
+        timestamp=now,
+        purpose="Compile the provided data and complete the users task"
+    )
+    content = header + prompt_template.format(**prompt_kwargs)
+    if VERBOSE:
+        print(LOG_PROMPT.format(content))
+    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
+    # Each streamed item carries one generated token; join them in order.
+    resp = "".join(piece.token.text for piece in stream)
+    if VERBOSE:
+        print(LOG_RESPONSE.format(resp))
+    return resp
+def compress_data(c, instruct, history):
+    """Summarise ``history`` in MAX_DATA-sized slices, one model call each.
+
+    Args:
+        c: Amount of data to cover; converted with ``int()``. Presumably
+            ``len(history)`` — verify against the caller.
+        instruct: Passed to the prompt as ``direction``.
+        history: Sliceable data; each ``history[s:e]`` slice is sent to the
+            model as ``history``.
+
+    Returns:
+        A list with one model response per slice, in order. Empty when ``c``
+        is zero or negative.
+    """
+    seed=random.randint(1,1000000000)
+    print (c)
+    total = int(c)
+    # The previous float round-trip, c / (c / MAX_DATA), only ever reproduced
+    # MAX_DATA, divided by zero for c == 0 and was exposed to float rounding.
+    # Integer ceiling division gives the slice count directly.
+    chunk = MAX_DATA
+    divi = -(-total // chunk) if total > 0 else 0
+    print(f'chunk:: {chunk}')
+    print(f'divr:: {total / chunk}')
+    print (f'divi:: {divi}')
+    print(f'e:: {chunk}')
+    out = []
+    for index in range(divi):
+        s = index * chunk
+        e = s + chunk
+        print(f's:e :: {s}:{e}')
+        resp = run_gpt(
+            COMPRESS_DATA_PROMPT_SMALL,
+            stop_tokens=["observation:", "task:", "action:", "thought:"],
+            max_tokens=8192,
+            seed=seed,
+            direction=instruct,
+            knowledge="",
+            history=history[s:e],
+        )
+        out.append(resp)
+        print (resp)
+    return out
+def compress_data_og(c, instruct, history):
+    """Summarise ``history`` slice by slice, carrying the summary forward.
+
+    Each MAX_DATA-sized slice is sent to the model together with the
+    previous response as ``knowledge``, so the final response reflects every
+    slice seen so far.
+
+    Args:
+        c: Amount of data to cover; converted with ``int()``. Presumably
+            ``len(history)`` — verify against the caller.
+        instruct: Passed to the prompt as ``direction``.
+        history: Sliceable data; each ``history[s:e]`` slice is sent to the
+            model as ``history``.
+
+    Returns:
+        The last model response as a string, or ``""`` when ``c`` is zero or
+        negative and no slice is processed.
+    """
+    seed=random.randint(1,1000000000)
+    print (c)
+    total = int(c)
+    # The previous float round-trip, c / (c / MAX_DATA), only ever reproduced
+    # MAX_DATA, divided by zero for c == 0 and was exposed to float rounding.
+    # Integer ceiling division gives the slice count directly.
+    chunk = MAX_DATA
+    divi = -(-total // chunk) if total > 0 else 0
+    print(f'chunk:: {chunk}')
+    print(f'divr:: {total / chunk}')
+    print (f'divi:: {divi}')
+    print(f'e:: {chunk}')
+    # Start with an empty response so the final print/return cannot hit an
+    # unbound name when there is nothing to process.
+    resp = ""
+    new_history = ""
+    for index in range(divi):
+        s = index * chunk
+        e = s + chunk
+        print(f's:e :: {s}:{e}')
+        resp = run_gpt(
+            COMPRESS_DATA_PROMPT,
+            stop_tokens=["observation:", "task:", "action:", "thought:"],
+            max_tokens=8192,
+            seed=seed,
+            direction=instruct,
+            knowledge=new_history,
+            history=history[s:e],
+        )
+        # The response becomes the running knowledge for the next slice.
+        new_history = resp
+        print (resp)
+    print ("final" + resp)
+    return resp
def _count_delimiters(text):
    """Return how many spaces, commas and newlines *text* contains."""
    return sum(text.count(ch) for ch in (" ", ",", "\n"))


def summarize(
    inp: str,
    history: list,
    report_check: bool,
    sum_mem_check: str,
    data: str = None,
    files: list = None,
    url: str = None,
    pdf_url: str = None,
    pdf_batch: str = None
):
    """
    Collect text from the given sources and summarize it or save it as memory.

    This is a generator: it yields once immediately with a "Working on it..."
    placeholder and once more with the final result.

    Parameters:
    - inp (str): Instructions for the model. If empty, defaults to "Process this data".
    - history (list): Conversation history; it is cleared and replaced by a
      single (instruction, answer) pair.
    - report_check (bool): When summarizing, run a second pass
      (compress_data_og) over the compressed data to produce a report.
    - sum_mem_check (str): "Memory" to index the data with save_memory, or
      "Summarize" / "Summary" to compress it with compress_data.
    - data (str, optional): Raw text to process. Defaults to None (no text).
    - files (list, optional): Uploaded .pdf / .txt files (paths, or objects
      exposing the path as `.name`). Defaults to None.
    - url (str, optional): Web page to crawl with find_all. Defaults to None.
    - pdf_url (str, optional): URL of a single PDF. Defaults to None.
    - pdf_batch (str, optional): Comma-separated PDF URLs. Defaults to None.

    Yields:
    - A 4-tuple:
        - An empty string (used by the caller to clear the prompt box).
        - The history list: [(inp, answer)].
        - A list of error messages collected while reading the sources.
        - The structured output (an empty list while working, then the value
          returned by save_memory / compress_data, or None).

    Source precedence is unchanged from the original implementation: a PDF
    batch replaces `data`, a single PDF URL replaces that, a web URL replaces
    that, and uploaded files are appended to whatever is there.
    """
    # Errors are gathered locally and yielded so the caller can display them
    # instead of them only being printed to stdout.
    errors = []
    json_box = []
    rawp = ""
    json_out = None
    if not inp:
        inp = "Process this data"
    # A missing text input must behave like empty text; otherwise the literal
    # string "None" would be sent to the model.
    if data is None:
        data = ""
    if history is not None:
        history.clear()
    history = [(inp, "Working on it...")]
    yield "", history, list(errors), json_box

    # Process PDF batch URLs
    if pdf_batch and pdf_batch.strip().startswith("http"):
        data = ""
        for batch_url in (part.strip() for part in pdf_batch.split(",")):
            if not batch_url:
                continue
            try:
                pdf_text = read_pdf_online(batch_url)
            except Exception as exc:  # one bad URL must not abort the batch
                print(exc)
                errors.append(f"Error reading PDF ({batch_url}): {exc}")
                continue
            data = f'{data}\nFile Name URL ({batch_url}):\n{pdf_text}'

    # Process single PDF URL
    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)

    # Process regular URL
    if url and url.startswith("http"):
        val, out = find_all(inp, "", history, url, "")
        if not val:
            data = "Error"
            rawp = str(out)
            errors.append(rawp)
        else:
            data = out

    # Process uploaded files
    for file in files or []:
        # Depending on the Gradio version an upload is a path string or a
        # file wrapper whose path is in `.name`.
        path = file if isinstance(file, str) else getattr(file, "name", str(file))
        print(path)
        lowered = path.lower()
        try:
            if lowered.endswith(".pdf"):
                file_text = read_pdf(path)
            elif lowered.endswith(".txt"):
                file_text = read_txt(path)
            else:
                continue
        except Exception as exc:
            print(exc)
            errors.append(f"Error opening File Name ({path}): {exc}")
            data = f'{data}\nError opening File Name ({path})'
            continue
        data = f'{data}\nFile Name ({path}):\n{file_text}'

    # Process the collected data
    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        c = _count_delimiters(out)
        print(f'c:: {c}')
        if sum_mem_check == "Memory":
            json_out = save_memory(inp, out)
            rawp = "Complete"
        elif sum_mem_check in ("Summarize", "Summary"):
            json_out = compress_data(c, inp, out)
            out = str(json_out)
            if report_check:
                print(f'rl:: {len(out)}')
                c = _count_delimiters(out)
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
        else:
            rawp = f"Unknown output type: {sum_mem_check}"
            errors.append(rawp)
    elif not rawp:
        # Keep a more specific message (e.g. the crawl error) if one was set.
        rawp = "Provide a valid data source"

    # A fresh list, so the placeholder yielded earlier is not mutated.
    history = [(inp, rawp)]
    yield "", history, errors, json_out
# Prompt template used by save_memory(). It contains the placeholders {task}
# and {history}; save_memory() additionally passes `purpose` to run_gpt
# (how run_gpt uses it is not visible here).
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"]
"title":"title of entry"
"description":"A sentence summarizing the topic of this entry"
"content":"A brief paragraph summarizing the important datapoints found in this entry"
"url":"https://url.source"
"""


def _upload_text(text, path_in_repo, tmp_name):
    """Write *text* to a scratch file, upload it to the dataset repo, then delete the file."""
    # NOTE(review): `api`, `reponame` and `token_self` are module-level names
    # defined outside this block; `api` is presumably a huggingface_hub HfApi.
    try:
        with open(tmp_name, "w", encoding="utf-8") as handle:
            handle.write(text)
        api.upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo=path_in_repo,
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )
    finally:
        # The scratch file is only a transport vehicle; never leave it behind.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)


def _load_memory_index():
    """Fetch the current mem-test2/main.json index, or an empty list if it is missing."""
    r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        return json.loads(r.text)
    return []


def _extract_keywords(line):
    """Return the cleaned keywords of a '"keywords":[...]' line, or None if it cannot be parsed."""
    try:
        raw = line.split(":")[1].split("[")[1].split("]")[0]
    except IndexError:
        return None
    # Keep only letters, digits and spaces of every comma-separated keyword.
    return [
        "".join(ch for ch in item.strip() if ch.isalnum() or ch == " ")
        for item in raw.split(",")
    ]


def save_memory(purpose, history):
    """
    Index *history* with the model and store the result in the dataset repo.

    The text is split into chunks; each chunk is sent to run_gpt with the
    SAVE_MEMORY prompt, the response is uploaded as
    /mem-test2/<timestamp>---<start>-<end>.json, and every "keywords" line of
    the response adds an entry to the shared index /mem-test2/main.json.

    Parameters:
    - purpose: Forwarded to run_gpt as `purpose`.
    - history: The data to index; converted with str().

    Returns:
    - A list with one model response (str) per processed chunk. Empty input
      returns an empty list without calling the model.
    """
    inp = str(history)
    if not inp:
        return []
    uid = uuid.uuid4()
    print(f'rl:: {len(inp)}')
    delimiters = {" ", ",", "\n", "/", "\\", ".", "<"}
    # Rough word count: 1 + number of delimiter characters.
    c = 1 + sum(1 for ch in inp if ch in delimiters)
    print(f'c:: {c}')
    seed = random.randint(1, 1000000000)
    # Number of chunks so that each holds at most ~MAX_DATA "words".
    divi = max(1, int(-(-c // MAX_DATA)))
    # The chunk width is measured in characters because the text is sliced by
    # character. Deriving it from len(inp) guarantees the chunks cover the
    # whole input; a fixed MAX_DATA-character width dropped the tail of long texts.
    chunk = max(1, -(-len(inp) // divi))
    print(f'chunk:: {chunk}')
    print(f'divi:: {divi}')
    out_box = []
    task = 'Index this Data\n'
    index_entries = None  # loaded lazily, then kept in memory across chunks
    for s in range(0, len(inp), chunk):
        ee = s + chunk
        print(f's:e :: {s}:{ee}')
        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=inp[s:ee],
        ).strip('\n')
        # Keep only the JSON list: from the first '[{' up to an end-of-sequence tag.
        if '[{' in resp:
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]
        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        entry_name = f"{timename}---{s}-{ee}"
        _upload_text(resp, f"/mem-test2/{entry_name}.json", f"tmp-{uid}.json")

        keyword_sets = []
        for line in resp.strip().strip("\n").split("\n"):
            if "keywords" not in line:
                continue
            print(f'trying:: {line}')
            key_box = _extract_keywords(line)
            if key_box is None:
                # Malformed keywords line: skip it instead of aborting the save.
                print(f'could not parse keywords:: {line}')
                continue
            keyword_sets.append(key_box)
        if keyword_sets:
            if index_entries is None:
                index_entries = _load_memory_index()
            for key_box in keyword_sets:
                index_entries.append(
                    {"file_name": entry_name, "keywords": key_box, "index": f"{s}:{ee}"}
                )
            # One index upload per chunk instead of one per keywords line.
            _upload_text(
                json.dumps(index_entries, indent=4),
                "/mem-test2/main.json",
                f"tmp2-{uid}.json",
            )
        out_box.append(resp)
    return out_box
def create_zip_file(output_data, zip_name):
    """Write every item of *output_data* into a new zip archive and return its path.

    Item number i is stored as the archive member ``data_<i>.txt``. An existing
    file at *zip_name* is overwritten.
    """
    archive = zipfile.ZipFile(zip_name, mode='w')
    with archive:
        for position, payload in enumerate(output_data):
            member_name = f'data_{position}.txt'
            archive.writestr(member_name, payload)
    return zip_name
def clear_fn():
    """Return reset values: an empty prompt string and a history holding one empty pair."""
    blank_prompt = ""
    blank_history = [(None, None)]
    return blank_prompt, blank_history
# Gradio UI: layout, event handlers and wiring for the summarizer.
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")
    # Main chat interface.
    # NOTE: the crawler widgets that used to be passed to gr.Chatbot(...) as
    # keyword arguments were never bound to names, so they could not be used
    # by any event handler; they are not part of the layout.
    chatbot = gr.Chatbot(
        label="Mixtral 8x7B Chatbot",
        show_copy_button=True,
        type='messages',
        height=400,
    )
    # Control Panel
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Instructions (optional)",
                placeholder="Enter processing instructions here..."
            )
            # Displayed but not consumed by any handler in this block.
            steps = gr.Slider(
                label="Crawl Steps",
                minimum=1,
                maximum=5,
                value=1,
                info="Number of levels to crawl for web content"
            )
        with gr.Column(scale=1):
            report_check = gr.Checkbox(
                label="Return Report",
                value=True,
                info="Generate detailed analysis report"
            )
            sum_mem_check = gr.Radio(
                label="Output Type",
                choices=["Summary", "Memory"],
                value="Summary",
                info="Choose between summarized or memory-based output"
            )
            button = gr.Button("Process", variant="primary")
    # Clear button
    with gr.Row():
        clear_btn = gr.Button("Clear", variant="secondary")
    # Input Tabs
    with gr.Tabs() as input_tabs:
        with gr.Tab("📝 Text"):
            data = gr.Textbox(
                label="Input Data",
                lines=6,
                placeholder="Paste your text here..."
            )
        with gr.Tab("📁 File"):
            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".txt"],
                file_count="multiple"
            )
        with gr.Tab("🌐 Web URL"):
            url = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com"
            )
        with gr.Tab("📄 PDF URL"):
            pdf_url = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Tab("📚 PDF Batch"):
            pdf_batch = gr.Textbox(
                label="PDF URLs (comma separated)",
                placeholder="url1.pdf, url2.pdf, url3.pdf"
            )
    # Output Section
    with gr.Row():
        with gr.Column():
            json_out = gr.JSON(
                label="Structured Output",
                show_label=True
            )
        with gr.Column():
            e_box = gr.Textbox(
                label="Status & Errors",
                interactive=False
            )

    def process_and_format_response(instructions, chat_history, report, summary_memory,
                                    input_data, uploaded_files, input_url, pdf_input_url,
                                    pdf_batch_input=None):
        """Run summarize() to completion and adapt its last result to the UI.

        Returns a 4-tuple matching the outputs of the Process button:
        (new prompt text, chat messages, status/error text, structured output).
        Any exception raised while processing is shown in the chat and the
        status box instead of propagating.
        """
        # summarize() checks for "Summarize", the radio offers "Summary".
        mode = "Summarize" if summary_memory == "Summary" else summary_memory
        try:
            # summarize() is a generator; only its final yield is shown.
            result = None
            for result in summarize(
                instructions,
                chat_history if chat_history else [],
                report,
                mode,
                input_data,
                uploaded_files,
                input_url,
                pdf_input_url,
                pdf_batch_input,
            ):
                pass
            if not result:
                message = "No result was produced"
                return (
                    "",
                    [
                        gr.ChatMessage(content=str(instructions), role="user"),
                        gr.ChatMessage(content=message, role="assistant")
                    ],
                    message,
                    None
                )
            _, history, errors, json_data = result
            # Convert (user, assistant) pairs to ChatMessage objects, as
            # required by a Chatbot created with type='messages'.
            formatted_messages = []
            if isinstance(history, list):
                for msg in history:
                    if isinstance(msg, tuple) and len(msg) == 2:
                        formatted_messages.extend([
                            gr.ChatMessage(content=str(msg[0]), role="user"),
                            gr.ChatMessage(content=str(msg[1]), role="assistant")
                        ])
            else:
                formatted_messages.extend([
                    gr.ChatMessage(content=str(instructions), role="user"),
                    gr.ChatMessage(content=str(history), role="assistant")
                ])
            # Format error messages; accept a single string or any iterable.
            if isinstance(errors, str):
                error_message = errors
            elif errors:
                error_message = "\n".join(str(err) for err in errors)
            else:
                error_message = ""
            if not error_message:
                error_message = "Processing completed successfully"
            # A plain-text summary is not valid JSON for the JSON component;
            # wrap it so it can still be displayed.
            if isinstance(json_data, str):
                try:
                    json_data = json.loads(json_data)
                except ValueError:
                    json_data = {"text": json_data}
            return (
                "",  # Clear the prompt
                formatted_messages,
                error_message,
                json_data
            )
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            return (
                "",
                [
                    gr.ChatMessage(content=str(instructions), role="user"),
                    gr.ChatMessage(content=error_msg, role="assistant")
                ],
                error_msg,
                None
            )

    def clear_fn():
        """Return an empty prompt and an empty message list for the chatbot."""
        return "", []

    # Process: feed every input widget to the handler defined above.
    button.click(
        process_and_format_response,
        inputs=[
            prompt,
            chatbot,
            report_check,
            sum_mem_check,
            data,
            files,
            url,
            pdf_url,
            pdf_batch
        ],
        outputs=[prompt, chatbot, e_box, json_out]
    )
    # Clear: reset the prompt and the conversation.
    clear_btn.click(clear_fn, outputs=[prompt, chatbot])

# Launch the app (after the Blocks context has been closed)
app.queue(default_concurrency_limit=20).launch(
    show_api=False,
    share=True,
    server_name="0.0.0.0",
    server_port=7860
)