# Source: Hugging Face Space "acecalisto3/urld" (status: Running; file size: 22,157 bytes).

import gradio as gr
#import urllib.request
import requests
import zipfile
import uuid
import bs4
import lxml
import os
#import subprocess
from huggingface_hub import InferenceClient,HfApi
import random
import json
import datetime
from pypdf import PdfReader
import uuid
#from query import tasks
from agent import (
    PREFIX,
    COMPRESS_DATA_PROMPT,
    COMPRESS_DATA_PROMPT_SMALL,
    LOG_PROMPT,
    LOG_RESPONSE,
)
# Text-generation client for the hosted Mixtral instruct model; run_gpt streams
# completions through it.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

# Dataset repository that save_memory uploads to, and the raw-file URL prefix
# it uses to read the index back.
reponame = "acecalisto3/tmp"
save_data = f"https://huggingface.co/datasets/{reponame}/raw/main/"

# HF_TOKEN is read with a plain subscript, so a missing variable raises
# KeyError while the module is being imported.
token_self = os.environ["HF_TOKEN"]
api = HfApi(token=token_self)

def find_all(purpose, task, history, url, result, steps=1):
    """
    Crawls breadth-first from a starting URL and collects the text of every page fetched.

    Parameters:
    - purpose, task, history, result: Accepted for call-site compatibility; the crawl does not use them.
    - url (str): Absolute URL the crawl starts from (depth 0).
    - steps (int, optional): Number of link levels to fetch. Only pages whose depth is
      below `steps` are requested, so 1 fetches just `url` and 0 fetches nothing.
      Defaults to 1; None is treated as 1.

    Returns:
    - A tuple (True, pages). The first element is always True; `pages` holds one
      'RAW TEXT RETURNED: ...' string per page that answered with HTTP 200.

    Fetch and parse errors are printed and the page is skipped; they never propagate.
    """
    from collections import deque

    if steps is None:
        steps = 1

    return_list = []
    visited_links = set()
    # deque gives O(1) pops from the left; list.pop(0) shifts the whole queue.
    links_to_visit = deque([(url, 0)])

    while links_to_visit:
        current_url, current_depth = links_to_visit.popleft()
        if current_depth >= steps or current_url in visited_links:
            continue
        visited_links.add(current_url)

        try:
            # A timeout keeps one unresponsive host from stalling the crawl.
            source = requests.get(current_url, timeout=30)
            if source.status_code != 200:
                continue

            soup = bs4.BeautifulSoup(source.content, 'lxml')
            return_list.append(f'RAW TEXT RETURNED: {soup.text}')

            next_depth = current_depth + 1
            if next_depth >= steps:
                # Links found here would be discarded on pop, so do not queue them.
                continue
            for link in soup.find_all("a"):
                href = link.get('href')
                if href and href.startswith('http') and href not in visited_links:
                    links_to_visit.append((href, next_depth))
        except Exception as e:
            print(f"Error fetching {current_url}: {e}")

    return True, return_list

def read_txt(txt_path):
    """
    Reads a text file, prints its contents, and returns them.

    Parameters:
    - txt_path (str): Path of the file to read.

    Returns:
    - The whole file as a single string.

    The file is decoded as UTF-8 explicitly so the result does not depend on the
    platform's default encoding. A decoding error propagates to the caller.
    """
    # The with-block closes the file; no separate close() is needed.
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()
    print(text)
    return text

def read_pdf(pdf_path):
    """
    Extracts the text of every page of a local PDF, prints it, and returns it.

    Parameters:
    - pdf_path: Path of the PDF file; it is formatted into a string before being opened.

    Returns:
    - One string in which each page contributes a newline followed by its extracted text.
    """
    document = PdfReader(f'{pdf_path}')
    # Same layout as appending page by page: "\n<page 1>\n<page 2>...".
    combined = "".join(f'\n{sheet.extract_text()}' for sheet in document.pages)
    print (combined)
    return combined

# URLs whose download answered with a non-200 status. This list is module-wide,
# so entries accumulate across calls.
error_box=[]
def read_pdf_online(url):
    """
    Downloads a PDF and returns the text extracted from all of its pages.

    Parameters:
    - url (str): Address of the PDF to download.

    Returns:
    - str: Each page's text preceded by a newline, when the download answers 200.
    - int: The HTTP status code when it is not 200; `url` is appended to `error_box`.
    - Exception: The caught exception object when the request or the parsing fails.
    """
    import io

    print(f"reading {url}")
    try:
        # The request sits inside the try so network failures are reported the
        # same way as parse failures; the timeout bounds a stalled download.
        response = requests.get(url, timeout=60)
        print(response.status_code)

        if response.status_code != 200:
            error_box.append(url)
            return response.status_code

        # Parse from memory instead of a shared "test.pdf": concurrent requests
        # would otherwise overwrite each other's download.
        reader = PdfReader(io.BytesIO(response.content))
        print(len(reader.pages))
        text = "".join(f'\n{page.extract_text()}' for page in reader.pages)
        print(f"PDF_TEXT:: {text}")
        return text
    except Exception as e:
        print (e)
        return e


# When true, run_gpt prints the assembled prompt and the model's reply.
VERBOSE = True
# NOTE(review): not referenced anywhere in the visible part of this file.
MAX_HISTORY = 100
# Divisor used by the compress/save helpers when splitting input into chunks.
MAX_DATA = 20_000

def format_prompt(message, history):
    """
    Renders a conversation and a new message in the [INST] ... [/INST] prompt layout.

    Parameters:
    - message: The new user message; placed last.
    - history: Iterable of (user_prompt, bot_response) pairs, oldest first.

    Returns:
    - "<s>", then each past exchange as "[INST] user [/INST] reply</s> ",
      then the new message as "[INST] message [/INST]".
    """
    earlier_turns = "".join(
        f"[INST] {user_prompt} [/INST] {bot_response}</s> "
        for user_prompt, bot_response in history
    )
    return f"<s>{earlier_turns}[INST] {message} [/INST]"



def run_gpt(
    prompt_template,
    stop_tokens,
    max_tokens,
    seed,
    **prompt_kwargs,
):
    """
    Fills a prompt template, sends it to the text-generation client, and returns the reply.

    Parameters:
    - prompt_template (str): Template completed with `prompt_kwargs` via str.format.
    - stop_tokens: Accepted for call-site compatibility; not forwarded to the client.
    - max_tokens (int): Passed to the client as `max_new_tokens`.
    - seed (int): Sampling seed passed to the client.
    - **prompt_kwargs: Values for the template's placeholders.

    Returns:
    - The generated text, assembled from the streamed tokens.
    """
    print(seed)

    # PREFIX is stamped with the current time and a fixed purpose, then the
    # filled-in template is appended to it.
    header = PREFIX.format(
        timestamp=datetime.datetime.now(),
        purpose="Compile the provided data and complete the users task"
    )
    content = header + prompt_template.format(**prompt_kwargs)
    if VERBOSE:
        print(LOG_PROMPT.format(content))

    sampling = {
        "temperature": 0.9,
        "max_new_tokens": max_tokens,
        "top_p": 0.95,
        "repetition_penalty": 1.0,
        "do_sample": True,
        "seed": seed,
    }
    stream = client.text_generation(content, **sampling, stream=True, details=True, return_full_text=False)
    # The stream is consumed fully before returning; nothing is yielded incrementally.
    resp = "".join(piece.token.text for piece in stream)

    if VERBOSE:
        print(LOG_RESPONSE.format(resp))
    return resp

    
def compress_data(c, instruct, history):
    """
    Summarizes `history` in consecutive chunks and returns one summary per chunk.

    Parameters:
    - c (int): Delimiter count supplied by the caller. It is only logged; chunk
      boundaries are derived from the real length of `history`.
    - instruct (str): Instruction passed to the prompt as `direction`.
    - history (str): The data to summarize.

    Returns:
    - A list with the model's reply for each chunk, in order. Empty when `history` is empty.

    Chunks are MAX_DATA characters long and cover the whole input. Sizing the loop
    from `c` (a delimiter count) while slicing characters left everything past the
    first ceil(c / MAX_DATA) * MAX_DATA characters unprocessed, and divided by zero
    when `c` was 0.
    """
    if not history:
        return []

    seed = random.randint(1, 1000000000)

    print(c)
    total = len(history)
    divi = -(-total // MAX_DATA)  # ceiling division: number of chunks
    print(f'chunk:: {MAX_DATA}')
    print(f'divi:: {divi}')

    out = []
    for s in range(0, total, MAX_DATA):
        e = s + MAX_DATA
        print(f's:e :: {s}:{e}')

        resp = run_gpt(
            COMPRESS_DATA_PROMPT_SMALL,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge="",
            history=history[s:e],
        )
        out.append(resp)
        print(resp)
    return out

    
def compress_data_og(c, instruct, history):
    """
    Folds `history` into a single running summary and returns the final version.

    Parameters:
    - c (int): Delimiter count supplied by the caller. It is only logged; chunk
      boundaries are derived from the real length of `history`.
    - instruct (str): Instruction passed to the prompt as `direction`.
    - history (str): The data to summarize.

    Returns:
    - The model's reply for the last chunk, or "" when `history` is empty.

    Each chunk is sent together with the previous reply as `knowledge`, so the
    last reply has seen every earlier one. Chunks are MAX_DATA characters long and
    cover the whole input; the former arithmetic based on `c` dropped the tail of
    long inputs, divided by zero for `c == 0`, and left `resp` unbound when no
    chunk ran.
    """
    if not history:
        return ""

    seed = random.randint(1, 1000000000)

    print(c)
    total = len(history)
    divi = -(-total // MAX_DATA)  # ceiling division: number of chunks
    print(f'chunk:: {MAX_DATA}')
    print(f'divi:: {divi}')

    resp = ""
    new_history = ""
    for s in range(0, total, MAX_DATA):
        e = s + MAX_DATA
        print(f's:e :: {s}:{e}')

        resp = run_gpt(
            COMPRESS_DATA_PROMPT,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=8192,
            seed=seed,
            direction=instruct,
            knowledge=new_history,
            history=history[s:e],
        )
        # Carry the summary so far into the next chunk's prompt.
        new_history = resp
        print(resp)

    print("final" + resp)
    return resp



def summarize(
    inp: str,
    history: list,
    report_check: bool,
    sum_mem_check: str,
    data: str = None,
    files: list = None,
    url: str = None,
    pdf_url: str = None,
    pdf_batch: str = None
):
    """
    Collects data from the given sources, then summarizes it or saves it as memory.

    This is a generator: it yields once immediately with a "Working on it..."
    placeholder and once more with the final result.

    Parameters:
    - inp (str): Instruction for the model. If empty, defaults to "Process this data".
    - history (list): Chat history list; it is cleared and replaced by this call.
    - report_check (bool): When summarizing, also condense the chunk summaries into one report.
    - sum_mem_check (str): "Summarize" (or "Summary") to summarize, "Memory" to save memory.
    - data (str, optional): Raw text to process. Defaults to None, treated as empty.
    - files (list, optional): Paths of .pdf / .txt files whose text is appended to the data.
    - url (str, optional): Web page to fetch; replaces any data gathered before it.
    - pdf_url (str, optional): URL of a single PDF; replaces any data gathered before it.
    - pdf_batch (str, optional): Comma-separated PDF URLs; replaces the text input.

    Yields:
    - A tuple containing:
        - An empty string (clears the prompt box).
        - The history list holding one (instruction, reply) pair.
        - The shared error box.
        - The structured output: an empty list on the first yield, the
          summarize/memory result (or None) on the last.
    """
    json_box = []
    rawp = ""
    json_out = None

    if not inp:
        inp = "Process this data"
    if data is None:
        # A missing text input must not be processed as the literal string "None".
        data = ""

    if history is not None:
        history.clear()
    history = [(inp, "Working on it...")]
    yield "", history, error_box, json_box

    # Process PDF batch URLs
    if pdf_batch and pdf_batch.strip().startswith("http"):
        data = ""
        for batch_url in pdf_batch.split(","):
            # Entries are typically typed as "a.pdf, b.pdf"; drop the padding.
            batch_url = batch_url.strip()
            if not batch_url:
                continue
            try:
                bb = read_pdf_online(batch_url)
                data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
            except Exception as e:
                # One unreachable PDF should not discard the rest of the batch.
                print(e)

    # Process single PDF URL
    if pdf_url and pdf_url.startswith("http"):
        print("PDF_URL")
        data = read_pdf_online(pdf_url)

    # Process regular URL
    if url and url.startswith("http"):
        # find_all takes six arguments; the crawl depth is passed explicitly
        # (1 = fetch only the given page).
        val, out = find_all(inp, "", history, url, "", 1)
        if not val:
            data = "Error"
            rawp = str(out)
        else:
            data = out

    # Process uploaded files
    if files:
        for file in files:
            try:
                print(file)
                if file.endswith(".pdf"):
                    zz = read_pdf(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
                elif file.endswith(".txt"):
                    zz = read_txt(file)
                    print(zz)
                    data = f'{data}\nFile Name ({file}):\n{zz}'
            except Exception as e:
                data = f'{data}\nError opening File Name ({file})'
                print(e)

    # Process the collected data
    if data != "Error" and data != "":
        print(inp)
        out = str(data)
        print(f'rl:: {len(out)}')
        c = sum(out.count(delim) for delim in (" ", ",", "\n"))  # Count delimiters
        print(f'c:: {c}')

        if sum_mem_check == "Memory":
            json_out = save_memory(inp, out)
            rawp = "Complete"

        # The UI radio offers "Summary"; "Summarize" is the historical value.
        if sum_mem_check in ("Summarize", "Summary"):
            json_out = compress_data(c, inp, out)
            out = str(json_out)

            if report_check:
                print(f'rl:: {len(out)}')
                c = sum(out.count(delim) for delim in (" ", ",", "\n"))  # Count delimiters
                print(f'c2:: {c}')
                rawp = compress_data_og(c, inp, out)
            else:
                rawp = out
    else:
        rawp = "Provide a valid data source"

    history.clear()
    history.append((inp, rawp))
    yield "", history, error_box, json_out
# Prompt template used by save_memory. run_gpt completes it with str.format, so
# {task} and {history} are the only placeholders; any literal curly brace added
# here would have to be doubled.
SAVE_MEMORY = """
You are attempting to complete the task
task: {task}
Data:
{history}
Instructions:
Compile and categorize the data above into a JSON dictionary string
Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
Required keys:
"keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"]
"title":"title of entry"
"description":"A sentence summarizing the topic of this entry"
"content":"A brief paragraph summarizing the important datapoints found in this entry"
"url":"https://url.source"
"""

def _parse_keywords(line):
    """Return the cleaned keywords of a '"keywords": [...]' line, or None if it has no bracketed list."""
    _, _, after_colon = line.partition(":")
    if "[" not in after_colon:
        return None
    listed = after_colon.split("[", 1)[1].split("]", 1)[0]
    keywords = []
    for raw in listed.split(","):
        # Keep letters, digits and spaces only; this drops quotes and punctuation.
        cleaned = "".join(ch for ch in raw.strip() if ch.isalnum() or ch == " ")
        print(cleaned)
        keywords.append(cleaned)
    return keywords


def save_memory(purpose, history):
    """
    Indexes `history` chunk by chunk and stores the results in the dataset repository.

    For every chunk the model is asked (SAVE_MEMORY prompt) for a JSON entry. The
    reply is uploaded as its own file under mem-test2/, and each "keywords" line
    found in it adds a record to mem-test2/main.json, which is re-uploaded.

    Parameters:
    - purpose (str): Passed to the prompt formatter as `purpose`.
    - history: The data to index; converted with str().

    Returns:
    - A list with the (trimmed) model reply for each chunk. Empty when the data is empty.

    Chunks are MAX_DATA characters long and cover the whole input; the former
    arithmetic based on a delimiter count left the tail of long inputs unindexed.
    Payloads are uploaded from memory, so no temporary files are left behind.
    """
    inp = str(history)
    if not inp:
        return []

    rl = len(inp)
    print(f'rl:: {rl}')
    # Delimiter count is kept for the log only.
    c = 1 + sum(1 for ch in inp if ch in ' ,\n/\\.<')
    print(f'c:: {c}')

    seed = random.randint(1, 1000000000)
    divi = -(-rl // MAX_DATA)  # ceiling division: number of chunks
    print(f'chunk:: {MAX_DATA}')
    print(f'divi:: {divi}')

    out_box = []
    task = 'Index this Data\n'
    for s in range(0, rl, MAX_DATA):
        ee = s + MAX_DATA
        print(f's:e :: {s}:{ee}')

        resp = run_gpt(
            SAVE_MEMORY,
            stop_tokens=["observation:", "task:", "action:", "thought:"],
            max_tokens=4096,
            seed=seed,
            purpose=purpose,
            task=task,
            history=inp[s:ee],
        ).strip('\n')

        # Keep only the JSON array: drop anything before "[{" and after "</s>".
        if '[{' in resp:
            resp = '[{' + resp.split('[{')[1].split('</s>')[0]

        timestamp = str(datetime.datetime.now())
        timename = timestamp.replace(" ", "--").replace(":", "-").replace(".", "-")
        entry_name = f"{timename}---{s}-{ee}"

        api.upload_file(
            path_or_fileobj=resp.encode("utf-8"),
            path_in_repo=f"/mem-test2/{entry_name}.json",
            repo_id=reponame,
            token=token_self,
            repo_type="dataset",
        )

        # Load the current index; any non-200 answer starts a fresh one.
        r = requests.get(f'{save_data}mem-test2/main.json', timeout=30)
        print(f'status code main:: {r.status_code}')
        if r.status_code == 200:
            lod = json.loads(r.text)
            print(f'lod:: {lod}')
        else:
            lod = []

        index_changed = False
        for line in resp.strip().split("\n"):
            print(f'LINE:: {line}')
            if "keywords" not in line:
                continue
            print(f'trying:: {line}')
            key_box = _parse_keywords(line)
            if key_box is None:
                # "keywords" appeared in running text, not as a list; skip it
                # rather than abort the whole save with an IndexError.
                continue
            lod.append({"file_name": entry_name, "keywords": key_box, "index": f"{s}:{ee}"})
            index_changed = True

        if index_changed:
            # One upload per chunk; the stored index matches what per-line uploads produced.
            api.upload_file(
                path_or_fileobj=json.dumps(lod, indent=4).encode("utf-8"),
                path_in_repo="/mem-test2/main.json",
                repo_id=reponame,
                token=token_self,
                repo_type="dataset",
            )

        out_box.append(resp)
    return out_box

def create_zip_file(output_data, zip_name):
    """
    Writes each item of `output_data` into a new zip archive and returns the archive's name.

    Parameters:
    - output_data: Iterable of payloads; item number i is stored as data_<i>.txt.
    - zip_name: Path of the archive to create (opened in write mode).

    Returns:
    - `zip_name`, unchanged.
    """
    with zipfile.ZipFile(zip_name, mode='w') as archive:
        for position, payload in enumerate(output_data):
            member_name = f'data_{position}.txt'
            archive.writestr(member_name, payload)
    return zip_name


    
def clear_fn():
    """Return reset values: an empty prompt string and a chat list holding one blank exchange."""
    blank_prompt = ""
    blank_chat = [(None, None)]
    return blank_prompt, blank_chat

# User interface: a chat view fed by summarize(), the input tabs for each data
# source, and a separate panel that runs the find_all crawler on its own.
with gr.Blocks() as app:
    gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")

    # Main chat interface
    chatbot = gr.Chatbot(
        label="Mixtral 8x7B Chatbot",
        show_copy_button=True,
        type='messages',
        height=400,
    )

    # Control Panel
    with gr.Row():
        with gr.Column(scale=3):
            prompt = gr.Textbox(
                label="Instructions (optional)",
                placeholder="Enter processing instructions here..."
            )
            # NOTE(review): summarize() has no crawl-depth parameter, so this
            # slider is displayed but not passed to any handler.
            steps = gr.Slider(
                label="Crawl Steps",
                minimum=1,
                maximum=5,
                value=1,
                info="Number of levels to crawl for web content"
            )
        with gr.Column(scale=1):
            report_check = gr.Checkbox(
                label="Return Report",
                value=True,
                info="Generate detailed analysis report"
            )
            sum_mem_check = gr.Radio(
                label="Output Type",
                choices=["Summary", "Memory"],
                value="Summary",
                info="Choose between summarized or memory-based output"
            )
            button = gr.Button("Process", variant="primary")

    # Clear button
    with gr.Row():
        clear_btn = gr.Button("Clear", variant="secondary")

    # Input Tabs
    with gr.Tabs() as input_tabs:
        with gr.Tab("📝 Text"):
            data = gr.Textbox(
                label="Input Data",
                lines=6,
                placeholder="Paste your text here..."
            )
        with gr.Tab("📁 File"):
            files = gr.File(
                label="Upload Files",
                file_types=[".pdf", ".txt"],
                file_count="multiple"
            )
        with gr.Tab("🌐 Web URL"):
            url = gr.Textbox(
                label="Website URL",
                placeholder="https://example.com"
            )
        with gr.Tab("📄 PDF URL"):
            pdf_url = gr.Textbox(
                label="PDF URL",
                placeholder="https://example.com/document.pdf"
            )
        with gr.Tab("📚 PDF Batch"):
            pdf_batch = gr.Textbox(
                label="PDF URLs (comma separated)",
                placeholder="url1.pdf, url2.pdf, url3.pdf"
            )

    # Output Section
    with gr.Row():
        with gr.Column():
            json_out = gr.JSON(
                label="Structured Output",
                show_label=True
            )
        with gr.Column():
            e_box = gr.Textbox(
                label="Status & Errors",
                interactive=False
            )

    # Stand-alone crawler panel. These components must be created as their own
    # statements: passing them to gr.Chatbot as keyword arguments never bound
    # the names that the click handler refers to.
    with gr.Accordion("Web Search", open=False):
        purpose_input = gr.Textbox(label="Purpose")
        task_input = gr.Textbox(label="Task")
        history_input = gr.Textbox(label="History")
        url_input = gr.Textbox(label="URL")
        result_input = gr.Textbox(label="Result")
        steps_input = gr.Number(label="Steps", value=3, precision=0)  # Default value of 3 steps
        search_button = gr.Button("Search")
        output_component = gr.Textbox(label="Output")

    def process_and_format_response(instructions, chat_history, report, summary_memory,
                                    input_data, uploaded_files, input_url, pdf_input_url,
                                    pdf_batch_urls=None):
        """Run summarize() to completion and adapt its last result to the UI outputs.

        Returns a 4-tuple: cleared prompt text, chat messages, status text, JSON data.
        """
        try:
            # The radio offers "Summary"; summarize() keys on "Summarize".
            mode = "Summarize" if summary_memory == "Summary" else summary_memory

            # summarize() is a generator; only its final yield is shown.
            result = None
            for result in summarize(
                instructions,
                chat_history if chat_history else [],
                report,
                mode,
                input_data,
                uploaded_files,
                input_url,
                pdf_input_url,
                pdf_batch_urls,
            ):
                pass

            if not result:
                # Every output still needs a value, so never fall through to None.
                return "", [], "No result was produced", None

            _, history, errors, json_data = result

            # Convert (user, assistant) tuples to ChatMessage objects for type='messages'.
            formatted_messages = []
            if isinstance(history, list):
                for msg in history:
                    if isinstance(msg, tuple) and len(msg) == 2:
                        formatted_messages.extend([
                            gr.ChatMessage(content=str(msg[0]), role="user"),
                            gr.ChatMessage(content=str(msg[1]), role="assistant")
                        ])
            else:
                formatted_messages.extend([
                    gr.ChatMessage(content=str(instructions), role="user"),
                    gr.ChatMessage(content=str(history), role="assistant")
                ])

            # Format error messages; str() guards against non-string entries.
            error_message = "\n".join(str(err) for err in errors) if errors else "Processing completed successfully"

            return (
                "",  # Clear the prompt
                formatted_messages,
                error_message,
                json_data
            )
        except Exception as e:
            error_msg = f"Error: {str(e)}"
            return (
                "",
                [
                    gr.ChatMessage(content=str(instructions), role="user"),
                    gr.ChatMessage(content=error_msg, role="assistant")
                ],
                error_msg,
                None
            )

    def run_search(purpose, task, history, start_url, result, depth):
        """Crawl from start_url with find_all and return the page texts as one string."""
        # find_all returns (flag, pages); the single output box needs one value.
        _, pages = find_all(purpose, task, history, start_url, result,
                            1 if depth is None else int(depth))
        return "\n\n".join(pages)

    def clear_fn():
        """Return reset values for the prompt box and the chat view."""
        return "", []

    # "Process" gathers every input source and fills the chat, status and JSON views.
    button.click(
        process_and_format_response,
        inputs=[
            prompt,
            chatbot,
            report_check,
            sum_mem_check,
            data,
            files,
            url,
            pdf_url,
            pdf_batch
        ],
        outputs=[prompt, chatbot, e_box, json_out]
    )

    clear_btn.click(clear_fn, inputs=None, outputs=[prompt, chatbot])

    search_button.click(
        run_search,
        inputs=[
            purpose_input,
            task_input,
            history_input,
            url_input,
            result_input,
            steps_input
        ],
        outputs=[output_component]
    )

# Launch the app once the layout is complete, outside the Blocks context.
app.queue(default_concurrency_limit=20).launch(
    show_api=False,
    share=True,
    server_name="0.0.0.0",
    server_port=7860
)