acecalisto3 committed on
Commit
c00eec9
·
verified ·
1 Parent(s): 4340847

Update app2.py

Browse files
Files changed (1) hide show
  1. app2.py +145 -532
app2.py CHANGED
@@ -1,19 +1,15 @@
1
  import gradio as gr
2
- #import urllib.request
3
  import requests
4
  import zipfile
5
  import uuid
6
  import bs4
7
  import lxml
8
  import os
9
- #import subprocess
10
- from huggingface_hub import InferenceClient,HfApi
11
  import random
12
  import json
13
  import datetime
14
  from pypdf import PdfReader
15
- import uuid
16
- #from query import tasks
17
  from agent import (
18
  PREFIX,
19
  COMPRESS_DATA_PROMPT,
@@ -21,13 +17,22 @@ from agent import (
21
  LOG_PROMPT,
22
  LOG_RESPONSE,
23
  )
24
- client = InferenceClient(
25
- "mistralai/Mixtral-8x7B-Instruct-v0.1"
26
- )
27
- reponame="acecalisto3/tmp"
28
- save_data=f'https://huggingface.co/datasets/{reponame}/raw/main/'
29
- token_self = os.environ['HF_TOKEN']
30
- api=HfApi(token=token_self)
 
 
 
 
 
 
 
 
 
31
 
32
  def find_all(purpose, task, history, url, result, steps):
33
  return_list = []
@@ -56,83 +61,43 @@ def find_all(purpose, task, history, url, result, steps):
56
  return True, return_list
57
 
58
  def read_txt(txt_path):
59
- text=""
60
- with open(txt_path,"r") as f:
61
  text = f.read()
62
- f.close()
63
- print (text)
64
  return text
65
 
66
  def read_pdf(pdf_path):
67
- text=""
68
- reader = PdfReader(f'{pdf_path}')
69
- number_of_pages = len(reader.pages)
70
- for i in range(number_of_pages):
71
- page = reader.pages[i]
72
  text = f'{text}\n{page.extract_text()}'
73
- print (text)
74
  return text
75
 
76
- error_box=[]
77
  def read_pdf_online(url):
78
- uid=uuid.uuid4()
79
  print(f"reading {url}")
80
  response = requests.get(url, stream=True)
81
- print(response.status_code)
82
- text=""
83
- #################
84
-
85
- #####################
86
- try:
87
- if response.status_code == 200:
88
- with open("test.pdf", "wb") as f:
89
- f.write(response.content)
90
- #f.close()
91
- #out = Path("./data.pdf")
92
- #print (out)
93
- reader = PdfReader("test.pdf")
94
- number_of_pages = len(reader.pages)
95
- print(number_of_pages)
96
- for i in range(number_of_pages):
97
- page = reader.pages[i]
98
- text = f'{text}\n{page.extract_text()}'
99
- print(f"PDF_TEXT:: {text}")
100
- return text
101
- else:
102
- text = response.status_code
103
- error_box.append(url)
104
- print(text)
105
- return text
106
-
107
-
108
- except Exception as e:
109
- print (e)
110
- return e
111
-
112
-
113
- VERBOSE = True
114
- MAX_HISTORY = 100
115
- MAX_DATA = 20000
116
 
117
  def format_prompt(message, history):
118
- prompt = "<s>"
119
- for user_prompt, bot_response in history:
120
- prompt += f"[INST] {user_prompt} [/INST]"
121
- prompt += f" {bot_response}</s> "
122
- prompt += f"[INST] {message} [/INST]"
123
- return prompt
124
-
125
-
126
-
127
- def run_gpt(
128
- prompt_template,
129
- stop_tokens,
130
- max_tokens,
131
- seed,
132
- **prompt_kwargs,
133
- ):
134
- print(seed)
135
- timestamp=datetime.datetime.now()
136
 
137
  generate_kwargs = dict(
138
  temperature=0.9,
@@ -147,48 +112,30 @@ def run_gpt(
147
  timestamp=timestamp,
148
  purpose="Compile the provided data and complete the users task"
149
  ) + prompt_template.format(**prompt_kwargs)
 
150
  if VERBOSE:
151
  print(LOG_PROMPT.format(content))
152
 
153
-
154
- #formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
155
- #formatted_prompt = format_prompt(f'{content}', history)
156
-
157
  stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
158
  resp = ""
159
  for response in stream:
160
  resp += response.token.text
161
- #yield resp
162
 
163
  if VERBOSE:
164
  print(LOG_RESPONSE.format(resp))
165
  return resp
166
 
167
-
168
  def compress_data(c, instruct, history):
169
- seed=random.randint(1,1000000000)
170
-
171
- print (c)
172
- #tot=len(purpose)
173
- #print(tot)
174
- divr=int(c)/MAX_DATA
175
- divi=int(divr)+1 if divr != int(divr) else int(divr)
176
  chunk = int(int(c)/divr)
177
- print(f'chunk:: {chunk}')
178
- print(f'divr:: {divr}')
179
- print (f'divi:: {divi}')
180
  out = []
181
- #out=""
182
- s=0
183
- e=chunk
184
- print(f'e:: {e}')
185
- new_history=""
186
- #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
187
  for z in range(divi):
188
- print(f's:e :: {s}:{e}')
189
-
190
  hist = history[s:e]
191
-
192
  resp = run_gpt(
193
  COMPRESS_DATA_PROMPT_SMALL,
194
  stop_tokens=["observation:", "task:", "action:", "thought:"],
@@ -199,506 +146,172 @@ def compress_data(c, instruct, history):
199
  history=hist,
200
  )
201
  out.append(resp)
202
- #new_history = resp
203
- print (resp)
204
- #out+=resp
205
- e=e+chunk
206
- s=s+chunk
207
  return out
208
 
209
-
210
- def compress_data_og(c, instruct, history):
211
- seed=random.randint(1,1000000000)
212
-
213
- print (c)
214
- #tot=len(purpose)
215
- #print(tot)
216
- divr=int(c)/MAX_DATA
217
- divi=int(divr)+1 if divr != int(divr) else int(divr)
218
- chunk = int(int(c)/divr)
219
- print(f'chunk:: {chunk}')
220
- print(f'divr:: {divr}')
221
- print (f'divi:: {divi}')
222
- out = []
223
- #out=""
224
- s=0
225
- e=chunk
226
- print(f'e:: {e}')
227
- new_history=""
228
- #task = f'Compile this data to fulfill the task: {task}, and complete the purpose: {purpose}\n'
229
- for z in range(divi):
230
- print(f's:e :: {s}:{e}')
231
-
232
- hist = history[s:e]
233
 
234
- resp = run_gpt(
235
- COMPRESS_DATA_PROMPT,
236
- stop_tokens=["observation:", "task:", "action:", "thought:"],
237
- max_tokens=8192,
238
- seed=seed,
239
- direction=instruct,
240
- knowledge=new_history,
241
- history=hist,
242
- )
243
 
244
- new_history = resp
245
- print (resp)
246
- out+=resp
247
- e=e+chunk
248
- s=s+chunk
249
- '''
250
- resp = run_gpt(
251
- COMPRESS_DATA_PROMPT,
252
- stop_tokens=["observation:", "task:", "action:", "thought:"],
253
- max_tokens=8192,
254
- seed=seed,
255
- direction=instruct,
256
- knowledge=new_history,
257
- history="All data has been recieved.",
258
- )'''
259
- print ("final" + resp)
260
- #history = "observation: {}\n".format(resp)
261
- return resp
262
-
263
-
264
-
265
- def summarize(
266
- inp: str,
267
- history: list,
268
- report_check: bool,
269
- sum_mem_check: str,
270
- data: str = None,
271
- files: list = None,
272
- url: str = None,
273
- pdf_url: str = None,
274
- pdf_batch: str = None
275
- ) -> str:
276
- """
277
- Summarizes the provided input data, processes files, URLs, and PDFs, and yields the results.
278
-
279
- Parameters:
280
- - inp (str): The input data to be processed. If empty, defaults to "Process this data".
281
- - history (list): A list to keep track of the conversation history.
282
- - report_check (bool): A flag indicating whether to return a report.
283
- - sum_mem_check (str): A string indicating whether to summarize or save memory.
284
- - data (str, optional): Additional data to process. Defaults to None.
285
- - files (list, optional): A list of file paths to process. Defaults to None.
286
- - url (str, optional): A URL to fetch data from. Defaults to None.
287
- - pdf_url (str, optional): A URL pointing to a PDF file to read. Defaults to None.
288
- - pdf_batch (str, optional): A batch of PDF URLs (comma-separated) to read. Defaults to None.
289
-
290
- Yields:
291
- - A tuple containing:
292
- - An empty string (for future use).
293
- - The updated history list.
294
- - An error box (if any errors occurred).
295
- - A JSON box for structured output.
296
-
297
- The function processes the input data, reads from specified URLs, PDFs, and files, and summarizes or saves the data based on the provided parameters.
298
- """
299
- json_box = []
300
- rawp = ""
301
- json_out = None
302
-
303
- if inp == "":
304
- inp = "Process this data"
305
-
306
- history.clear()
307
- history = [(inp, "Working on it...")]
308
- yield "", history, error_box, json_box
309
-
310
- # Process PDF batch URLs
311
- if pdf_batch and pdf_batch.startswith("http"):
312
- c = pdf_batch.count(",") + 1 # Count the number of URLs
313
- data = ""
314
- try:
315
- for i in range(c):
316
- batch_url = pdf_batch.split(",", c)[i]
317
- bb = read_pdf_online(batch_url)
318
- data = f'{data}\nFile Name URL ({batch_url}):\n{bb}'
319
- except Exception as e:
320
- print(e)
321
-
322
- # Process single PDF URL
323
- if pdf_url and pdf_url.startswith("http"):
324
- print("PDF_URL")
325
- out = read_pdf_online(pdf_url)
326
- data = out
327
-
328
- # Process regular URL
329
- if url and url.startswith("http"):
330
- val, out = find_all(inp, "", history, url, "") # Add missing arguments
331
- if not val:
332
- data = "Error"
333
- rawp = str(out) # Assign rawp here
334
  else:
335
- data = out
336
 
337
- # Process uploaded files
338
- if files:
339
- for i, file in enumerate(files):
340
- try:
341
- print(file)
342
- if file.endswith(".pdf"):
343
- zz = read_pdf(file)
344
- print(zz)
345
- data = f'{data}\nFile Name ({file}):\n{zz}'
346
- elif file.endswith(".txt"):
347
- zz = read_txt(file)
348
- print(zz)
349
- data = f'{data}\nFile Name ({file}):\n{zz}'
350
- except Exception as e:
351
- data = f'{data}\nError opening File Name ({file})'
352
- print(e)
353
-
354
- # Process the collected data
355
- if data != "Error" and data != "":
356
- print(inp)
357
- out = str(data)
358
- rl = len(out)
359
- print(f'rl:: {rl}')
360
- c = sum(1 for i in str(out) if i in [" ", ",", "\n"]) # Count delimiters
361
- print(f'c:: {c}')
362
-
363
- if sum_mem_check == "Memory":
364
- json_out = save_memory(inp, out)
365
- rawp = "Complete" # Assign rawp here
366
-
367
- if sum_mem_check == "Summarize":
368
- json_out = compress_data(c, inp, out)
369
- out = str(json_out)
370
-
371
- if report_check:
372
- rl = len(out)
373
- print(f'rl:: {rl}')
374
- c = sum(1 for i in str(out) if i in [" ", ",", "\n"]) # Count delimiters
375
- print(f'c2:: {c}')
376
- rawp = compress_data_og(c, inp, out) # Assign rawp here
377
  else:
378
- rawp = out # Assign rawp here
379
- else:
380
- rawp = "Provide a valid data source" # Assign rawp here
381
 
382
- history.clear()
383
- history.append((inp, rawp))
384
- yield "", history, error_box, json_out
385
- SAVE_MEMORY = """
386
- You are attempting to complete the task
387
- task: {task}
388
- Data:
389
- {history}
390
- Instructions:
391
- Compile and categorize the data above into a JSON dictionary string
392
- Include ALL text, datapoints, titles, descriptions, and source urls indexed into an easy to search JSON format
393
- Your final response should be only the final formatted JSON string enclosed in brackets, and nothing else.
394
- Required keys:
395
- "keywords":["short", "list", "of", "important", "keywords", "found", "in", "this", "entry"]
396
- "title":"title of entry"
397
- "description":"A sentence summarizing the topic of this entry"
398
- "content":"A brief paragraph summarizing the important datapoints found in this entry"
399
- "url":"https://url.source"
400
- """
401
 
402
- def save_memory(purpose, history):
403
- uid=uuid.uuid4()
404
- history=str(history)
405
- c=1
406
- inp = str(history)
407
- rl = len(inp)
408
- print(f'rl:: {rl}')
409
- for i in str(inp):
410
- if i == " " or i=="," or i=="\n" or i=="/" or i=="\\" or i=="." or i=="<":
411
- c +=1
412
- print (f'c:: {c}')
413
 
414
- seed=random.randint(1,1000000000)
415
-
416
- print (c)
417
- #tot=len(purpose)
418
- #print(tot)
419
- divr=int(c)/MAX_DATA
420
- divi=int(divr)+1 if divr != int(divr) else int(divr)
421
- chunk = int(int(c)/divr)
422
- print(f'chunk:: {chunk}')
423
- print(f'divr:: {divr}')
424
- print (f'divi:: {divi}')
425
- out_box = []
426
- #out=""
427
- s=0
428
- ee=chunk
429
- print(f'e:: {ee}')
430
- new_history=""
431
- task = f'Index this Data\n'
432
- for z in range(divi):
433
- print(f's:e :: {s}:{ee}')
434
-
435
- hist = inp[s:ee]
436
-
437
- resp = run_gpt(
438
- SAVE_MEMORY,
439
- stop_tokens=["observation:", "task:", "action:", "thought:"],
440
- max_tokens=4096,
441
- seed=seed,
442
- purpose=purpose,
443
- task=task,
444
- history=hist,
445
- ).strip('\n')
446
- #new_history = resp
447
- #print (resp)
448
- #out+=resp
449
-
450
- #print ("final1" + resp)
451
- try:
452
- resp='[{'+resp.split('[{')[1].split('</s>')[0]
453
- #print ("final2\n" + resp)
454
- #print(f"keywords:: {resp['keywords']}")
455
- except Exception as e:
456
- resp = resp
457
- print(e)
458
- timestamp=str(datetime.datetime.now())
459
- timename=timestamp.replace(" ","--").replace(":","-").replace(".","-")
460
- json_object=resp
461
- #json_object = json.dumps(out_box)
462
- #json_object = json.dumps(out_box,indent=4)
463
- with open(f"tmp-{uid}.json", "w") as outfile:
464
- outfile.write(json_object)
465
-
466
- outfile.close()
467
- api.upload_file(
468
- path_or_fileobj=f"tmp-{uid}.json",
469
- path_in_repo=f"/mem-test2/{timename}---{s}-{ee}.json",
470
- repo_id=reponame,
471
- #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
472
- token=token_self,
473
- repo_type="dataset",
474
- )
475
- lines = resp.strip().strip("\n").split("\n")
476
- r = requests.get(f'{save_data}mem-test2/main.json')
477
- print(f'status code main:: {r.status_code}')
478
- if r.status_code==200:
479
-
480
- lod = json.loads(r.text)
481
- #lod = eval(lod)
482
- print (f'lod:: {lod}')
483
- if not r.status_code==200:
484
- lod = []
485
- for i,line in enumerate(lines):
486
- key_box=[]
487
- print(f'LINE:: {line}')
488
- if ":" in line:
489
- print(f'line:: {line}')
490
-
491
- if "keywords" in line:
492
- print(f'trying:: {line}')
493
- keyw=line.split(":")[1]
494
- print (keyw)
495
- print (keyw.split("[")[1].split("]")[0])
496
- keyw=keyw.split("[")[1].split("]")[0]
497
- for ea in keyw.split(","):
498
- s1=""
499
- ea=ea.strip().strip("\n")
500
- for ev in ea:
501
- if ev.isalnum():
502
- s1+=ev
503
- if ev == " ":
504
- s1+=ev
505
- #ea=s1
506
- print(s1)
507
- key_box.append(s1)
508
- lod.append({"file_name":f"{timename}---{s}-{ee}","keywords":key_box,"index":f"{s}:{ee}"})
509
- json_object = json.dumps(lod, indent=4)
510
- with open(f"tmp2-{uid}.json", "w") as outfile2:
511
- outfile2.write(json_object)
512
- outfile2.close()
513
- api.upload_file(
514
- path_or_fileobj=f"tmp2-{uid}.json",
515
- path_in_repo=f"/mem-test2/main.json",
516
- repo_id=reponame,
517
- #repo_id=save_data.split('datasets/',1)[1].split('/raw',1)[0],
518
- token=token_self,
519
- repo_type="dataset",
520
- )
521
- ee=ee+chunk
522
- s=s+chunk
523
- out_box.append(resp)
524
- return out_box
525
-
526
- def create_zip_file(output_data, zip_name):
527
- with zipfile.ZipFile(zip_name, 'w') as zipf:
528
- for i, data in enumerate(output_data):
529
- zipf.writestr(f'data_{i}.txt', data)
530
- return zip_name
531
 
 
 
 
532
 
533
-
534
  def clear_fn():
535
- return "", [(None, None)]
536
 
 
537
  with gr.Blocks() as app:
538
  gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")
539
 
540
  # Main chat interface
541
- chatbot = gr.Chatbot(
542
- label="Mixtral 8x7B Chatbot",
543
- show_copy_button=True,
544
- type='messages',
545
- height=400,
546
- purpose_input = gr.Textbox(label="Purpose"),
547
- task_input = gr.Textbox(label="Task"),
548
- history_input = gr.Textbox(label="History"),
549
- url_input = gr.Textbox(label="URL"),
550
- result_input = gr.Textbox(label="Result"),
551
- steps_input = gr.Number(label="Steps", value=3), # Default value of 3 steps
552
- output_component = gr.Textbox(label="Output"),
553
- button = gr.Button("Search"),
554
- )
555
 
556
  # Control Panel
557
  with gr.Row():
558
  with gr.Column(scale=3):
559
  prompt = gr.Textbox(
560
- label="Instructions (optional)",
561
  placeholder="Enter processing instructions here..."
562
  )
563
  steps = gr.Slider(
564
- label="Crawl Steps",
565
- minimum=1,
566
- maximum=5,
567
  value=1,
568
  info="Number of levels to crawl for web content"
569
  )
570
  with gr.Column(scale=1):
571
  report_check = gr.Checkbox(
572
- label="Return Report",
573
  value=True,
574
  info="Generate detailed analysis report"
575
  )
576
  sum_mem_check = gr.Radio(
577
- label="Output Type",
578
- choices=["Summary", "Memory"],
579
  value="Summary",
580
  info="Choose between summarized or memory-based output"
581
  )
582
- button = gr.Button("Process", variant="primary")
583
-
584
- # Clear button
585
- with gr.Row():
586
- clear_btn = gr.Button("Clear", variant="secondary")
587
 
588
  # Input Tabs
589
  with gr.Tabs() as input_tabs:
590
  with gr.Tab("πŸ“ Text"):
591
- data = gr.Textbox(
592
- label="Input Data",
593
  lines=6,
594
  placeholder="Paste your text here..."
595
  )
596
  with gr.Tab("πŸ“ File"):
597
- files = gr.File(
598
  label="Upload Files",
599
  file_types=[".pdf", ".txt"],
600
  file_count="multiple"
601
  )
602
  with gr.Tab("🌐 Web URL"):
603
- url = gr.Textbox(
604
  label="Website URL",
605
  placeholder="https://example.com"
606
  )
607
  with gr.Tab("πŸ“„ PDF URL"):
608
- pdf_url = gr.Textbox(
609
  label="PDF URL",
610
  placeholder="https://example.com/document.pdf"
611
  )
612
- with gr.Tab("πŸ“š PDF Batch"):
613
- pdf_batch = gr.Textbox(
614
- label="PDF URLs (comma separated)",
615
- placeholder="url1.pdf, url2.pdf, url3.pdf"
616
- )
617
 
618
  # Output Section
619
  with gr.Row():
620
  with gr.Column():
621
- json_out = gr.JSON(
622
  label="Structured Output",
623
  show_label=True
624
  )
625
  with gr.Column():
626
- e_box = gr.Textbox(
627
- label="Status & Errors",
628
  interactive=False
629
  )
630
-
631
- def process_and_format_response(instructions, chat_history, report, summary_memory,
632
- input_data, uploaded_files, input_url, pdf_input_url): # Removed extra parameters
633
- try:
634
- # Process the inputs with reduced parameters
635
- result = None
636
- for _ in summarize(
637
- instructions,
638
- chat_history if chat_history else [],
639
- report,
640
- summary_memory,
641
- input_data,
642
- uploaded_files,
643
- input_url,
644
- pdf_input_url # Removed extra parameters
645
- ):
646
- result = _
647
-
648
- if result:
649
- _, history, errors, json_data = result
650
-
651
- # Convert history to ChatMessage format
652
- formatted_messages = []
653
- if isinstance(history, list):
654
- for msg in history:
655
- if isinstance(msg, tuple) and len(msg) == 2:
656
- formatted_messages.extend([
657
- gr.ChatMessage(content=str(msg[0]), role="user"),
658
- gr.ChatMessage(content=str(msg[1]), role="assistant")
659
- ])
660
- else:
661
- formatted_messages.extend([
662
- gr.ChatMessage(content=str(instructions), role="user"),
663
- gr.ChatMessage(content=str(history), role="assistant")
664
- ])
665
-
666
- # Format error messages
667
- error_message = "\n".join(errors) if errors else "Processing completed successfully"
668
-
669
- return (
670
- "", # Clear the prompt
671
- formatted_messages,
672
- error_message,
673
- json_data
674
- )
675
- except Exception as e:
676
- error_msg = f"Error: {str(e)}"
677
- return (
678
- "",
679
- [
680
- gr.ChatMessage(content=str(instructions), role="user"),
681
- gr.ChatMessage(content=error_msg, role="assistant")
682
- ],
683
- error_msg,
684
- None
685
- )
686
-
687
- def clear_fn():
688
- return "", []
689
 
690
- # Update the button click event to match parameters
691
- button.click(
692
- find_all,
693
  inputs=[
694
- purpose_input, # Add these input components to your Gradio interface
695
- task_input,
696
- history_input,
 
 
 
697
  url_input,
698
- result_input,
699
- steps_input
700
  ],
701
- outputs=[output_component]
 
 
 
 
 
702
  )
703
 
704
  # Launch the app
@@ -706,5 +319,5 @@ with gr.Blocks() as app:
706
  show_api=False,
707
  share=True,
708
  server_name="0.0.0.0",
709
- server_port=7860
710
- )
 
1
  import gradio as gr
 
2
  import requests
3
  import zipfile
4
  import uuid
5
  import bs4
6
  import lxml
7
  import os
8
+ from huggingface_hub import InferenceClient, HfApi
 
9
  import random
10
  import json
11
  import datetime
12
  from pypdf import PdfReader
 
 
13
  from agent import (
14
  PREFIX,
15
  COMPRESS_DATA_PROMPT,
 
17
  LOG_PROMPT,
18
  LOG_RESPONSE,
19
  )
20
+
# Hugging Face configuration.
reponame = "acecalisto3/tmp"
save_data = f'https://huggingface.co/datasets/{reponame}/raw/main/'

# Read the HF token from the environment. When it is missing we keep None
# rather than a placeholder string: a made-up token is transmitted as a real
# credential, while None lets huggingface_hub fall back to its own default
# token resolution / anonymous access.
token_self = os.environ.get('HF_TOKEN') or None
if token_self is None:
    print("Warning: Running in demo mode without HuggingFace token. Some features may be limited.")

# Inference client for the Mixtral model and Hub API handle; both share the
# same (possibly absent) token.
client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1", token=token_self)
api = HfApi(token=token_self)

# Constants
VERBOSE = True      # when True, run_gpt prints the formatted prompt and response
MAX_HISTORY = 100
MAX_DATA = 20000    # divisor used by compress_data to size its chunks
36
 
37
  def find_all(purpose, task, history, url, result, steps):
38
  return_list = []
 
61
  return True, return_list
62
 
def read_txt(txt_path):
    """Return the full contents of a text file.

    The file is decoded as UTF-8 regardless of the platform default, and
    undecodable bytes are replaced instead of raising, so an upload saved in
    another encoding still yields usable text.

    Args:
        txt_path: Path of the text file to read.

    Returns:
        The file contents as a single string ("" for an empty file).

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(txt_path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
67
 
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF, passed straight to ``PdfReader``.

    Returns:
        A single string made of each page's extracted text, in page order,
        with a newline placed before every page (so a non-empty result
        starts with a newline). Returns "" when the document has no pages.
    """
    reader = PdfReader(pdf_path)
    # One join instead of rebuilding the accumulated string once per page.
    return "".join(f"\n{page.extract_text()}" for page in reader.pages)
74
 
# URLs whose download did not return HTTP 200; appended to by read_pdf_online.
error_box = []


def read_pdf_online(url):
    """Download a PDF over HTTP and return its extracted text.

    The response body is parsed from memory. Writing every download to one
    fixed file name would let two concurrent requests overwrite each other's
    document, so no temporary file is used.

    Args:
        url: Address of the PDF document.

    Returns:
        On HTTP 200: the text of all pages, each page preceded by a newline.
        Otherwise: the HTTP status code as a string; the URL is also
        recorded in the module-level ``error_box`` list.

    Raises:
        requests.RequestException: On connection errors or timeout.
    """
    import io  # local import so this block does not depend on the file header

    print(f"reading {url}")
    # A timeout keeps an unresponsive host from hanging the request forever.
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        error_box.append(url)
        return str(response.status_code)

    reader = PdfReader(io.BytesIO(response.content))
    return "".join(f"\n{page.extract_text()}" for page in reader.pages)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
def format_prompt(message, history):
    """Build an instruction-tagged prompt string from a chat history.

    Args:
        message: The new user message to place at the end of the prompt.
        history: Iterable of ``(user_prompt, bot_response)`` pairs from
            earlier turns. ``None`` or an empty value means no earlier turns.

    Returns:
        ``"<s>"`` followed by every earlier turn rendered as
        ``"[INST] user [/INST] bot</s> "`` and finally
        ``"[INST] message [/INST]"``.
    """
    pieces = ["<s>"]
    # `history or ()` tolerates a chat component that has not produced any
    # history yet (None) instead of failing on iteration.
    for user_prompt, bot_response in history or ():
        pieces.append(f"[INST] {user_prompt} [/INST]")
        pieces.append(f" {bot_response}</s> ")
    pieces.append(f"[INST] {message} [/INST]")
    return "".join(pieces)
98
+
99
+ def run_gpt(prompt_template, stop_tokens, max_tokens, seed, **prompt_kwargs):
100
+ timestamp = datetime.datetime.now()
 
 
 
 
 
 
 
 
 
101
 
102
  generate_kwargs = dict(
103
  temperature=0.9,
 
112
  timestamp=timestamp,
113
  purpose="Compile the provided data and complete the users task"
114
  ) + prompt_template.format(**prompt_kwargs)
115
+
116
  if VERBOSE:
117
  print(LOG_PROMPT.format(content))
118
 
 
 
 
 
119
  stream = client.text_generation(content, **generate_kwargs, stream=True, details=True, return_full_text=False)
120
  resp = ""
121
  for response in stream:
122
  resp += response.token.text
 
123
 
124
  if VERBOSE:
125
  print(LOG_RESPONSE.format(resp))
126
  return resp
127
 
 
128
  def compress_data(c, instruct, history):
129
+ seed = random.randint(1, 1000000000)
130
+ divr = int(c)/MAX_DATA
131
+ divi = int(divr)+1 if divr != int(divr) else int(divr)
 
 
 
 
132
  chunk = int(int(c)/divr)
 
 
 
133
  out = []
134
+ s = 0
135
+ e = chunk
136
+
 
 
 
137
  for z in range(divi):
 
 
138
  hist = history[s:e]
 
139
  resp = run_gpt(
140
  COMPRESS_DATA_PROMPT_SMALL,
141
  stop_tokens=["observation:", "task:", "action:", "thought:"],
 
146
  history=hist,
147
  )
148
  out.append(resp)
149
+ e = e+chunk
150
+ s = s+chunk
 
 
 
151
  return out
152
 
def create_zip_file(output_data, zip_name):
    """Write each entry of ``output_data`` into a new zip archive.

    Entry ``i`` is stored as ``data_{i}.txt``. An existing file at
    ``zip_name`` is overwritten.

    Args:
        output_data: Iterable of entries to store. ``str`` and ``bytes``
            values are written as-is; any other value is converted with
            ``str()`` first, because ``ZipFile.writestr`` accepts only text
            or bytes.
        zip_name: Path of the archive to create.

    Returns:
        ``zip_name``, unchanged.
    """
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for i, data in enumerate(output_data):
            if not isinstance(data, (str, bytes)):
                data = str(data)
            zipf.writestr(f'data_{i}.txt', data)
    return zip_name
158
+
def process_and_format_response(instructions, chat_history, report, summary_memory,
                                input_data, uploaded_files, input_url, pdf_input_url):
    """Gather text from one input source, summarize it, and format UI outputs.

    Exactly one source is used, chosen in this order: website URL, uploaded
    files, pasted text, PDF URL.

    Args:
        instructions: Instruction passed to ``compress_data``; when empty,
            "Summarize this text" is used.
        chat_history: Accepted for the UI wiring; not read here.
        report: Accepted for the UI wiring; not read here.
        summary_memory: Accepted for the UI wiring; not read here.
        input_data: Text pasted by the user.
        uploaded_files: Uploaded files, either path strings or objects
            exposing the path as ``.name``. Only ``.pdf`` and ``.txt``
            (any letter case) are read; others are ignored.
        input_url: Website URL handed to ``find_all``.
        pdf_input_url: URL of a PDF handed to ``read_pdf_online``.

    Returns:
        A 4-tuple ``(prompt_value, chat_rows, status_text, json_data)``:
        ``prompt_value`` is always "" (clears the prompt box), ``chat_rows``
        is a list of two-item lists, ``status_text`` describes the outcome,
        and ``json_data`` is a dict with ``input_length``,
        ``summary_length`` and ``summary`` on success or ``None`` on failure.
        Every path returns this shape; exceptions are reported in the tuple.
    """

    def _failure(chat_text, status_text):
        # Single place that builds the error-shaped 4-tuple.
        return "", [["Error", chat_text]], status_text, None

    try:
        # Process URL if provided
        if input_url:
            success, content = find_all("Extract content", "", [], input_url, "", 1)
            if not (success and content):
                return _failure("Failed to fetch URL content", "URL processing failed")
            # Items are stringified so a non-string entry cannot break the
            # join (assumes find_all returns an iterable of fragments —
            # TODO confirm against find_all).
            processed_text = "\n".join(str(item) for item in content)

        # Process uploaded files
        elif uploaded_files:
            parts = []
            for file in uploaded_files:
                # Works for plain path strings and for objects with `.name`.
                path = str(getattr(file, "name", file))
                lowered = path.lower()
                if lowered.endswith(".pdf"):
                    parts.append(read_pdf(path) + "\n\n")
                elif lowered.endswith(".txt"):
                    parts.append(read_txt(path) + "\n\n")
            processed_text = "".join(parts)

        # Process direct text input
        elif input_data:
            processed_text = input_data

        # Process a PDF referenced by URL
        elif pdf_input_url:
            failures_before = len(error_box)
            processed_text = read_pdf_online(pdf_input_url)
            # read_pdf_online records the URL in error_box and returns the
            # status code when the download fails.
            if len(error_box) > failures_before:
                return _failure(
                    f"Failed to fetch PDF (HTTP {processed_text})",
                    "PDF URL processing failed",
                )

        else:
            return _failure("No input provided", "No input data")

        # Nothing usable was extracted (e.g. only unsupported file types).
        # Returning here also avoids a division by zero in compress_data.
        if not processed_text or not processed_text.strip():
            return _failure("No text could be extracted from the input", "No content to process")

        # compress_data derives its number of chunks from `c` but slices the
        # text by character position, so `c` must be the character count;
        # a word count would leave the tail of long inputs unsummarized.
        c = len(processed_text)
        summary = compress_data(c, instructions or "Summarize this text", processed_text)

        # Format the response
        if isinstance(summary, list):
            summary_text = "\n".join(summary)
        else:
            summary_text = str(summary)

        # Show at most the first 500 characters of the input; mark
        # truncation only when something was actually cut off.
        preview = processed_text[:500]
        if len(processed_text) > 500:
            preview += "..."

        messages = [
            ["Input", preview],
            ["Summary", summary_text],
        ]

        structured = {
            "input_length": len(processed_text),
            "summary_length": len(summary_text),
            "summary": summary_text,
        }

        return "", messages, "Processing completed successfully", structured

    except Exception as e:
        # UI boundary: surface any failure in the status outputs instead of
        # letting the event handler crash.
        error_msg = f"Error: {str(e)}"
        return _failure(error_msg, error_msg)
214
 
 
def clear_fn():
    """Return the reset values: an empty prompt string and an empty history list."""
    cleared_prompt = ""
    cleared_history = []
    return cleared_prompt, cleared_history
217
 
218
+ # Create Gradio interface
219
  with gr.Blocks() as app:
220
  gr.HTML("""<center><h1>Mixtral 8x7B TLDR Summarizer + Web</h1><h3>Summarize Data of unlimited length</h3></center>""")
221
 
222
  # Main chat interface
223
+ with gr.Row():
224
+ chatbot = gr.Chatbot(
225
+ label="Mixtral 8x7B Chatbot",
226
+ show_copy_button=True,
227
+ height=400
228
+ )
 
 
 
 
 
 
 
 
229
 
230
  # Control Panel
231
  with gr.Row():
232
  with gr.Column(scale=3):
233
  prompt = gr.Textbox(
234
+ label="Instructions",
235
  placeholder="Enter processing instructions here..."
236
  )
237
  steps = gr.Slider(
238
+ label="Crawl Steps",
239
+ minimum=1,
240
+ maximum=5,
241
  value=1,
242
  info="Number of levels to crawl for web content"
243
  )
244
  with gr.Column(scale=1):
245
  report_check = gr.Checkbox(
246
+ label="Return Report",
247
  value=True,
248
  info="Generate detailed analysis report"
249
  )
250
  sum_mem_check = gr.Radio(
251
+ label="Output Type",
252
+ choices=["Summary", "Memory"],
253
  value="Summary",
254
  info="Choose between summarized or memory-based output"
255
  )
256
+ process_btn = gr.Button("Process", variant="primary")
 
 
 
 
257
 
258
  # Input Tabs
259
  with gr.Tabs() as input_tabs:
260
  with gr.Tab("πŸ“ Text"):
261
+ text_input = gr.Textbox(
262
+ label="Input Text",
263
  lines=6,
264
  placeholder="Paste your text here..."
265
  )
266
  with gr.Tab("πŸ“ File"):
267
+ file_input = gr.File(
268
  label="Upload Files",
269
  file_types=[".pdf", ".txt"],
270
  file_count="multiple"
271
  )
272
  with gr.Tab("🌐 Web URL"):
273
+ url_input = gr.Textbox(
274
  label="Website URL",
275
  placeholder="https://example.com"
276
  )
277
  with gr.Tab("πŸ“„ PDF URL"):
278
+ pdf_url_input = gr.Textbox(
279
  label="PDF URL",
280
  placeholder="https://example.com/document.pdf"
281
  )
 
 
 
 
 
282
 
283
  # Output Section
284
  with gr.Row():
285
  with gr.Column():
286
+ json_output = gr.JSON(
287
  label="Structured Output",
288
  show_label=True
289
  )
290
  with gr.Column():
291
+ error_output = gr.Textbox(
292
+ label="Status & Errors",
293
  interactive=False
294
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
296
+ # Event handlers
297
+ process_btn.click(
298
+ process_and_format_response,
299
  inputs=[
300
+ prompt,
301
+ chatbot,
302
+ report_check,
303
+ sum_mem_check,
304
+ text_input,
305
+ file_input,
306
  url_input,
307
+ pdf_url_input
 
308
  ],
309
+ outputs=[
310
+ prompt,
311
+ chatbot,
312
+ error_output,
313
+ json_output
314
+ ]
315
  )
316
 
317
  # Launch the app
 
319
  show_api=False,
320
  share=True,
321
  server_name="0.0.0.0",
322
+ server_port=8000
323
+ )