"""Gradio application: TLDR summarizer for raw text, uploaded files, crawled
web pages and online PDFs, backed by Mixtral-8x7B via the HF Inference API."""

import datetime
import json
import os
import random
import uuid
import zipfile
from collections import deque

import bs4
import gradio as gr
import lxml  # noqa: F401 -- backend required by BeautifulSoup's 'lxml' parser
import requests
from huggingface_hub import HfApi, InferenceClient
from pypdf import PdfReader

from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,  # noqa: F401 -- kept; part of the agent prompt set
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)

# Initialize Hugging Face client
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

reponame = "acecalisto3/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'

# Get HF token from environment or use demo mode
token_self = os.environ.get('HF_TOKEN', 'dummy_token')
if token_self == 'dummy_token':
    print("Warning: Running in demo mode without HuggingFace token. Some features may be limited.")

api = HfApi(token=token_self)

# Constants
VERBOSE = True
MAX_HISTORY = 100
MAX_DATA = 20000          # characters summarized per model call
REQUEST_TIMEOUT = 30      # seconds; prevents HTTP fetches from hanging the UI


def find_all(purpose, task, history, url, result, steps):
    """Breadth-first crawl starting at ``url``, collecting raw page text.

    Args:
        purpose, task, history, result: accepted for interface compatibility
            with the agent prompt pipeline; not used by the crawl itself.
        url: starting URL.
        steps: maximum crawl depth (1 = only the start page).

    Returns:
        Tuple ``(True, texts)`` where ``texts`` is a list of raw-text strings,
        one per successfully fetched page. Per-page errors are logged and
        skipped rather than aborting the crawl.
    """
    return_list = []
    visited_links = set()
    # deque: the original list.pop(0) is O(n) per dequeue.
    links_to_visit = deque([(url, 0)])
    while links_to_visit:
        current_url, current_depth = links_to_visit.popleft()
        if current_depth >= steps or current_url in visited_links:
            continue
        visited_links.add(current_url)
        try:
            # timeout added: an unresponsive host previously hung forever
            source = requests.get(current_url, timeout=REQUEST_TIMEOUT)
            if source.status_code == 200:
                soup = bs4.BeautifulSoup(source.content, 'lxml')
                return_list.append(f'RAW TEXT RETURNED: {soup.text}')
                for link in soup.find_all("a"):
                    href = link.get('href')
                    if href and href.startswith('http'):
                        links_to_visit.append((href, current_depth + 1))
        except Exception as e:
            print(f"Error fetching {current_url}: {e}")
    return True, return_list


def read_txt(txt_path):
    """Return the full contents of a text file (UTF-8)."""
    with open(txt_path, "r", encoding="utf-8") as f:
        return f.read()


def read_pdf(pdf_path):
    """Extract and return the text of every page of a local PDF."""
    reader = PdfReader(pdf_path)
    # Same output as the original accumulation: "\n<page1>\n<page2>..."
    return "".join(f'\n{page.extract_text()}' for page in reader.pages)


# URLs that failed to download are recorded here (module-level, best-effort).
error_box = []


def read_pdf_online(url):
    """Download a PDF from ``url`` and return its extracted text.

    On a non-200 response the URL is appended to ``error_box`` and the status
    code is returned as a string (preserving the original contract).
    """
    print(f"reading {url}")
    response = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        error_box.append(url)
        return str(response.status_code)
    # Unique temp name: the original fixed "test.pdf" was clobbered by
    # concurrent requests; the file is also removed afterwards now.
    tmp_path = f"{uuid.uuid4().hex}.pdf"
    try:
        with open(tmp_path, "wb") as f:
            f.write(response.content)
        reader = PdfReader(tmp_path)
        return "".join(f'\n{page.extract_text()}' for page in reader.pages)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            pass


def format_prompt(message, history):
    """Fold a chat history and a new message into Mixtral [INST] format."""
    prompt = ""
    for user_prompt, bot_response in history:
        prompt += f"[INST] {user_prompt} [/INST]"
        prompt += f" {bot_response} "
    prompt += f"[INST] {message} [/INST]"
    return prompt


def run_gpt(prompt_template, stop_tokens, max_tokens, seed, **prompt_kwargs):
    """Stream a completion from the model for a formatted prompt template.

    Args:
        prompt_template: agent template whose placeholders are filled from
            ``prompt_kwargs``.
        stop_tokens: accepted for interface compatibility (not forwarded).
        max_tokens: generation budget (``max_new_tokens``).
        seed: sampling seed for reproducibility across chunks.

    Returns:
        The concatenated generated text.
    """
    timestamp = datetime.datetime.now()
    generate_kwargs = dict(
        temperature=0.9,
        max_new_tokens=max_tokens,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=seed,
    )
    content = PREFIX.format(
        timestamp=timestamp,
        purpose="Compile the provided data and complete the users task"
    ) + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))
    stream = client.text_generation(content, **generate_kwargs, stream=True,
                                    details=True, return_full_text=False)
    resp = ""
    for response in stream:
        resp += response.token.text
    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp


def compress_data(c, instruct, history):
    """Summarize ``history`` chunk by chunk with the model.

    Args:
        c: size estimate used to decide the number of chunks (the caller
            passes a word count).
        instruct: direction/instruction forwarded to the prompt.
        history: the text to compress (sliced by character position, matching
            the original behavior).

    Returns:
        List of per-chunk summaries (empty when ``c`` <= 0).
    """
    seed = random.randint(1, 1000000000)
    c = int(c)
    if c <= 0:
        # Fix: the original divided by c/MAX_DATA, raising ZeroDivisionError
        # for empty input.
        return []
    # The original chunk arithmetic always reduced to ~MAX_DATA per slice and
    # ceil(c / MAX_DATA) slices; written plainly here.
    num_chunks = -(-c // MAX_DATA)  # ceiling division
    out = []
    for i in range(num_chunks):
        # NOTE(review): c is a word count while slicing is by character, so
        # very long texts may not be fully covered — preserved from original.
        hist = history[i * MAX_DATA:(i + 1) * MAX_DATA]
        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=hist,
        )
        out.append(resp)
    return out


def create_zip_file(output_data, zip_name):
    """Write each item of ``output_data`` as data_<i>.txt into a zip archive."""
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for i, data in enumerate(output_data):
            zipf.writestr(f'data_{i}.txt', data)
    return zip_name


def process_and_format_response(instructions, chat_history, report, summary_memory,
                                input_data, uploaded_files, input_url, pdf_input_url,
                                crawl_steps=1):
    """Gradio handler: gather input from one source, summarize, format outputs.

    Input priority: web URL > uploaded files > PDF URL > direct text.

    Returns:
        ``(cleared_prompt, chat_messages, status_text, json_payload)``.
    """
    try:
        if input_url:
            success, content = find_all("Extract content", "", [], input_url,
                                        "", int(crawl_steps))
            if success and content:
                processed_text = "\n".join(content)
            else:
                return "", [["Error", "Failed to fetch URL content"]], "URL processing failed", None
        elif uploaded_files:
            processed_text = ""
            for file in uploaded_files:
                if file.name.endswith('.pdf'):
                    processed_text += read_pdf(file.name) + "\n\n"
                elif file.name.endswith('.txt'):
                    processed_text += read_txt(file.name) + "\n\n"
        elif pdf_input_url:
            # Fix: the "PDF URL" tab was collected by the UI but never handled.
            processed_text = read_pdf_online(pdf_input_url)
        elif input_data:
            processed_text = input_data
        else:
            return "", [["Error", "No input provided"]], "No input data", None

        if not processed_text.strip():
            # Fix: the original fell off the end (implicit None) here, which
            # breaks Gradio's 4-output unpacking.
            return "", [["Error", "No input provided"]], "No input data", None

        c = len(processed_text.split())
        summary = compress_data(c, instructions or "Summarize this text",
                                processed_text)
        summary_text = "\n".join(summary) if isinstance(summary, list) else str(summary)

        messages = [
            ["Input", processed_text[:500] + "..."],  # preview first 500 chars
            ["Summary", summary_text]
        ]
        json_output = {
            "input_length": len(processed_text),
            "summary_length": len(summary_text),
            "summary": summary_text
        }
        return "", messages, "Processing completed successfully", json_output
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        return "", [["Error", error_msg]], error_msg, None


def clear_fn():
    """Reset the prompt box and chat history."""
    return "", []


# Create Gradio interface
with gr.Blocks() as app:
    gr.HTML("""

Mixtral 8x7B TLDR Summarizer + Web

Summarize Data of unlimited length

""")
    # Main chat interface
    with gr.Row():
        chatbot = gr.Chatbot(
            label="Mixtral 8x7B Chatbot",
            show_copy_button=True,
            height=400
        )
    # Control Panel
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Instructions",
                placeholder="Enter processing instructions here..."
            )
            steps = gr.Slider(
                label="Crawl Steps",
                minimum=1,
                maximum=5,
                value=1,
                info="Number of levels to crawl for web content"
            )
        with gr.Column(scale=1):
            report_check = gr.Checkbox(
                label="Return Report",
                value=True,
                info="Generate detailed analysis report"
            )
            sum_mem_check = gr.Radio(
                label="Output Type",
                choices=["Summary", "Memory"],
                value="Summary",
                info="Choose between summarized or memory-based output"
            )
            process_btn = gr.Button("Process", variant="primary")
    # Input Tabs
    with gr.Tabs() as input_tabs:
        with gr.Tab("📝 Text"):
            text_input = gr.Textbox(
                label="Input Text",
                lines=6,
                placeholder="Paste your text here..."
            )
        with gr.Tab("📁 File"):
            file_input = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".txt"],
                file_count="multiple"
            )
        with gr.Tab("🌐 Web URL"):
            url_input = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com"
            )
        with gr.Tab("📄 PDF URL"):
            pdf_url_input = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
    # Output Section
    with gr.Row():
        with gr.Column():
            json_output = gr.JSON(
                label="Structured Output",
                show_label=True
            )
        with gr.Column():
            error_output = gr.Textbox(
                label="Status & Errors",
                interactive=False
            )
    # Event handlers
    process_btn.click(
        process_and_format_response,
        inputs=[
            prompt,
            chatbot,
            report_check,
            sum_mem_check,
            text_input,
            file_input,
            url_input,
            pdf_url_input,
            steps,  # fix: the crawl-depth slider was defined but never wired in
        ],
        outputs=[
            prompt,
            chatbot,
            error_output,
            json_output
        ]
    )

# Launch the app
app.queue(default_concurrency_limit=20).launch(
    show_api=False,
    share=False,  # fix: missing comma here was a SyntaxError in the original
    server_name="0.0.0.0",
    server_port=8000
)