billusanda007 committed
Commit 4d5914c · verified · 1 Parent(s): 70b9637

Upload xmlGrad.py

Files changed (1): xmlGrad.py (+620, -0)
xmlGrad.py ADDED
@@ -0,0 +1,620 @@
# -*- coding: utf-8 -*-
"""Untitled34.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1p8LZ5eICRuSfjSRLGIDv4TDW32GSm4Wf
"""

#!pip install torch gradio transformers pandas langchain-fireworks fireworks stanza sentence_transformers anytree beautifulsoup4 lxml scikit-learn nltk googlesearch-python

+
12
+ import torch
13
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer
14
+ import gradio as gr
15
+ import pandas as pd
16
+ from collections import Counter, defaultdict
17
+ import os
18
+ from huggingface_hub import login
19
+ import requests
20
+ from bs4 import BeautifulSoup
21
+ import numpy as np
22
+ import re
23
+ from sklearn.feature_extraction.text import TfidfVectorizer
24
+ from sklearn.metrics.pairwise import cosine_similarity
25
+ from googlesearch import search
26
+ import time
27
+ import random
28
+ from lxml import html
29
+
30
+ import nltk
31
+ nltk.download('punkt')
32
+ from sentence_transformers import SentenceTransformer, util
33
+ model_ranker = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
34
+
35

Question = [
    "RG Kar recent rape and murder case"
    # "Who won the physics nobel prize in 2023?",
    # "Who has been awarded the Nobel Prize in Physics in 2023?"
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}

# Boilerplate phrases to filter out of scraped article text.
exclude = [
    "Thank you for your patience", "Subscribe", "subscribe", "trouble retrieving the article content", "browser settings",
    "Thank you for your patience while we verify access. If you are in Reader mode please exit and log into your Times account, or subscribe for all of The Times.",
    "Thank you for your patience while we verify access.",
    "Already a subscriber? Log in.",
    "Want all of The Times? Subscribe.",
    "Advertisement",
    "Site Index",
    # (the run-together variants below are kept as-is: they match text as some
    # scrapers return it, with whitespace squashed out)
    "Thank you for your patience while we verify access. If you are in Reader mode please exit andlog intoyour Times account, orsubscribefor all of The Times.",
    "Already a subscriber?Log in.",
    "Want all of The Times?Subscribe.",
    "Site Information Navigation",
    "Please enable JS and disable any ad blocker"
]

def fetch_article_text_sequential(url):
    headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    }

    exclude = [
        "Thank you for your patience", "Subscribe", "subscribe", "trouble retrieving the article content", "browser settings",
        "Thank you for your patience while we verify access. If you are in Reader mode please exit and log into your Times account, or subscribe for all of The Times.",
        "Thank you for your patience while we verify access.",
        "Already a subscriber? Log in.",
        "Want all of The Times? Subscribe.",
        "Advertisement",
        "Site Index",
        "Thank you for your patience while we verify access. If you are in Reader mode please exit andlog intoyour Times account, orsubscribefor all of The Times.",
        "Already a subscriber?Log in.",
        "Want all of The Times?Subscribe.",
        "Site Information Navigation"
    ]

    try:
        # Send a request to the webpage with the specified headers
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()  # Check that the request was successful

        # Parse the webpage content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Initialize an empty list to store the text sequentially
        article_content = []

        # Define the tags we are interested in (headlines and paragraphs)
        tags_of_interest = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']

        # Find all tags of interest in the order they appear in the document
        for tag in soup.find_all(tags_of_interest):
            if not any(excluded_phrase in tag.get_text() for excluded_phrase in exclude):
                text = tag.get_text(strip=True)
                article_content.append(text)

        return '\n'.join(article_content)

    except Exception:
        return None

def fetch_article_text_sequential_new(url):
    user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
        # Add more User-Agents here
    ]
    headers = {
        'User-Agent': random.choice(user_agents)
    }

    try:
        response = requests.get(url, timeout=5, verify=False, headers=headers)
        response.raise_for_status()  # Check for HTTP errors
        response.encoding = 'utf-8'
        content = response.text
        if not content.strip():
            return ""
        try:
            tree = html.fromstring(content)
        except Exception:
            return ""
        # Collect headline and paragraph text (grouped by tag type, not in
        # document order)
        scraped_data = []
        tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']
        for tag in tags:
            for element in tree.xpath(f'//{tag}'):
                scraped_data.append(element.text_content())
        return '\n'.join(scraped_data)
    except Exception:
        return ""

def get_google_search_results(query, start=0):
    search_url = "https://www.google.com/search"
    params = {"q": query, "start": start}

    response = requests.get(search_url, timeout=5, verify=False, params=params, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    search_results = []
    for g in soup.find_all(class_="g"):
        title = g.find("h3").text if g.find("h3") else "No title"
        link = g.find("a")["href"] if g.find("a") else "No link"

        # Skip direct links to PDF files
        if not link.lower().endswith('.pdf'):
            search_results.append({"title": title, "link": link})

    return search_results

def fetch_sentences_from_html(html_text):
    try:
        # Parse the string with BeautifulSoup
        if html_text is None:
            return []
        soup = BeautifulSoup(html_text, 'html.parser')
        paragraphs = soup.find_all("p")
        text = " ".join(p.get_text() for p in paragraphs)
        # Split on sentence-ending punctuation, avoiding common abbreviations
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)

        #print(sentences)

        return sentences
    except Exception as e:
        #print(f"Failed to parse HTML: {str(e)}")
        return []

# Function to rank sentences using cosine similarity
def rank_sentences(sentences):
    if not sentences:
        return []  # Return an empty list if no sentences are found

    embeddings = model_ranker.encode(sentences, convert_to_tensor=True)

    # Compute pairwise cosine similarity between sentences
    similarities = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

    # Calculate the average similarity for each sentence
    avg_similarities = np.mean(similarities, axis=1)

    # Rank sentences based on their average similarity
    ranked_sentences = sorted(zip(sentences, avg_similarities), key=lambda x: x[1], reverse=True)
    ranked_sentences = [sentence for sentence, _ in ranked_sentences]

    return ranked_sentences[:min(len(ranked_sentences), 2000)]
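
# Note: averaging each sentence's cosine similarity to every other sentence
# scores centrality, so the sentences most representative of the scraped
# corpus rank first. This ranking is query-independent; rank_sentences_new
# below is the query-aware TF-IDF variant.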

def rank_sentences_new(sentences, query, top_n=20):
    if sentences is None:
        return []
    sentences = re.split("\n", sentences.strip())
    # Remove any empty strings from the list
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    vectorizer = TfidfVectorizer().fit_transform([query] + sentences)
    vectors = vectorizer.toarray()
    query_vector = vectors[0]
    sentences_vectors = vectors[1:]
    cosine_similarities = cosine_similarity([query_vector], sentences_vectors).flatten()
    ranked_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return [sentences[idx] for idx in ranked_indices]

domains = [
    "wikipedia.org", "nytimes.com", "cnn.com", "bbc.com", "theguardian.com",
    "forbes.com", "reuters.com", "cnbc.com", "bloomberg.com", "foxnews.com",
    "npr.org", "washingtonpost.com", "wsj.com", "aljazeera.com", "ft.com",
    "huffpost.com", "nationalgeographic.com", "scientificamerican.com",
    "nature.com", "time.com", "usatoday.com", "apnews.com", "abcnews.go.com",
    "cbsnews.com", "nbcnews.com", "news.yahoo.com", "theatlantic.com",
    "vox.com", "politico.com", "economist.com"
]

# Define number of results we want to retrieve
num_results_needed = 40
all_results = []
start = 0

def get_web_content(user_query, num_results_needed):
    # Fetch result URLs (assumes the googlesearch-python package, whose
    # search() accepts a num_results argument) and scrape each page
    all_results = search(user_query, num_results=num_results_needed)
    t1 = time.time()
    text_combined = []
    web_context = []
    for result in all_results:
        text = fetch_article_text_sequential_new(result)
        print("===============================")
        print(result)
        print("\n\n")
        print(text)
        print("===============================")
        text = text.splitlines()
        text_combined.extend(text)

    # Keep only substantive lines: no boilerplate and more than 8 words
    for line in text_combined:
        if not any(excluded_phrase in line for excluded_phrase in exclude):
            if len(line.split()) > 8:
                web_context.append(line)

    top_sentences = rank_sentences(web_context)
    t2 = time.time()
    minutes, seconds = divmod(t2 - t1, 60)

    print(f"{int(minutes)} minutes and {seconds:.2f} seconds")

    ans = "\n".join(sentence.strip() for sentence in top_sentences if sentence.strip())
    return ans
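
# Illustrative use (query and doc count are examples, not defaults):
#   context = get_web_content("Nobel Prize in Physics 2023", 10)
# The result is a newline-joined string of filtered, centrality-ranked
# sentences, used below as the n-gram context corpus.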

# Get the token from the environment variable
api_token = os.getenv('HF_TOKEN')

# Load pre-trained model and tokenizer
model_name = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

#device = torch.device("mps")
#model.to(device)
model.eval()

def create_ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]
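
# Example (illustrative): create_ngrams(['a', 'b', 'c', 'd'], 3)
# returns [('a', 'b', 'c'), ('b', 'c', 'd')].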

###Smoothing___
def kneser_ney_smoothing(ngram_counts, lower_order_counts, discount=0.75):
    """
    Apply Kneser-Ney smoothing to n-gram counts.

    Args:
        ngram_counts (Counter): Counts of n-grams (e.g., 4-grams or 3-grams).
        lower_order_counts (Counter): Counts of (n-1)-grams (e.g., 3-grams or 2-grams).
        discount (float): Discounting parameter.

    Returns:
        defaultdict: Smoothed probabilities.
    """
    # Continuation counts: for each word, the number of distinct contexts it
    # completes (keyed by the word itself so the lookup below matches)
    contexts_per_word = defaultdict(set)
    for ngram in ngram_counts:
        contexts_per_word[ngram[-1]].add(ngram[:-1])
    continuation_counts = Counter({word: len(contexts) for word, contexts in contexts_per_word.items()})
    total_continuations = sum(continuation_counts.values())

    def continuation_probability(word):
        return continuation_counts[word] / total_continuations

    probabilities = defaultdict(lambda: defaultdict(float))

    for ngram, count in ngram_counts.items():
        lower_ngram = ngram[:-1]
        lower_count = lower_order_counts[lower_ngram]
        discounted_count = max(count - discount, 0)
        lambda_factor = (discount / lower_count) * len(continuation_counts)
        probabilities[lower_ngram][ngram[-1]] = (discounted_count / lower_count) + lambda_factor * continuation_probability(ngram[-1])

    return probabilities
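
# Sketch of the estimate computed above for an n-gram (context, w):
#   P(w | context) = max(count(context, w) - d, 0) / count(context)
#                    + lambda(context) * P_continuation(w)
# where P_continuation(w) is w's share of distinct-context continuation
# counts. Discounting observed n-grams and redistributing that mass through
# the continuation term keeps unseen continuations from scoring zero.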

def get_probability_from_context(Context):
    context_tokens = tokenizer.tokenize(Context)
    four_grams = create_ngrams(context_tokens, 4)
    three_grams = create_ngrams(context_tokens, 3)
    four_gram_counts = Counter(four_grams)
    three_gram_counts = Counter(three_grams)
    probabilities = kneser_ney_smoothing(four_gram_counts, three_gram_counts)

    return probabilities, four_gram_counts, three_gram_counts

def predict_next_token(probabilities, three_gram):
    return probabilities.get(three_gram, {})
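
# Example (with hypothetical GPT-2 BPE tokens): predict_next_token(probs,
# ('Ġthe', 'ĠNobel', 'ĠPrize')) returns a {token: probability} dict of seen
# continuations, or {} if that 3-gram context never occurs in the corpus.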

def generate_text_with_probs(initial_context, context_text, top_p, max_length, top_k, threshold=0.6):

    Tokens = {}

    #input_ids = tokenizer.encode(initial_context, return_tensors="pt").to(device='mps')
    input_ids = tokenizer.encode(initial_context, return_tensors="pt").to(device='cpu')
    generated_text = initial_context
    token_tables = []

    token_no = 1

    context_tokens = tokenizer.tokenize(context_text)

    four_grams = create_ngrams(context_tokens, 4)
    three_grams = create_ngrams(context_tokens, 3)
    two_grams = create_ngrams(context_tokens, 2)
    one_grams = create_ngrams(context_tokens, 1)

    four_gram_counts = Counter(four_grams)
    three_gram_counts = Counter(three_grams)
    two_grams_counts = Counter(two_grams)
    one_grams_counts = Counter(one_grams)

    prob_list = ["four_gram", "three_gram", "two_gram", "one_gram"]

    # Counter hierarchy used for back-off: 4-gram -> 3-gram -> 2-gram -> 1-gram
    prob = [four_gram_counts, three_gram_counts, two_grams_counts, one_grams_counts]
    probs = kneser_ney_smoothing(four_gram_counts, three_gram_counts)

    use_llm = 0
    use_llm_back_up = 0
    use_ngram = 0

    flag = False
    count = 0

    Token_index = 0
    colored_text = initial_context
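
    # Hybrid decoding loop: each step compares the LLM's nucleus-truncated
    # next-token distribution with the Kneser-Ney-smoothed n-gram distribution
    # built from the scraped web context, then emits a token from whichever
    # source the decision rule favours.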
    with torch.no_grad():

        #while len(generated_text.split()) < max_length:
        for _ in range(max_length):

            outputs = model(input_ids=input_ids)
            next_token_logits = outputs.logits[:, -1, :]

            # Nucleus (top-p) filtering: sort logits, then drop every token
            # outside the smallest set whose cumulative probability exceeds top_p
            sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
            sorted_indices_to_remove = cumulative_probs > top_p
            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
            sorted_indices_to_remove[..., 0] = 0

            indices_to_remove = sorted_indices[sorted_indices_to_remove]
            next_token_logits[:, indices_to_remove] = -float('Inf')
            probabilities = torch.softmax(next_token_logits, dim=-1)
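
            # Worked example of the top-p cutoff (illustrative numbers): with
            # sorted probs (0.5, 0.3, 0.15, 0.05) and top_p=0.9, the cumulative
            # sums are (0.5, 0.8, 0.95, 1.0); shifting the removal mask right by
            # one keeps the first token that crosses 0.9, so (0.5, 0.3, 0.15)
            # survive and only the 0.05 tail is dropped.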

            top_tokens = sorted_indices[0, :top_k]
            top_probs = probabilities[0, top_tokens]
            top_token_probs = [(tokenizer.decode([token.item()]), prob.item()) for token, prob in zip(top_tokens, top_probs)]

            df = pd.DataFrame(top_token_probs, columns=["Token", "Probability"])
            df.index = df.index + 1
            token_tables.append((f"{token_no}>> Next token options from LLM", df))

            ##print("Next token options from LLM")
            ##print(df)

            cumulative_prob = cumulative_probs[0, top_k - 1].item()
            ##print(f"cumulative_prob from LLM: {cumulative_prob}")
            entropy = (-1) * np.sum(np.array(df['Probability']) * np.log(df['Probability']))
            ##print("LLM Entropy:", entropy)

            input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            input_tokens = tokenizer.tokenize(input_text)

            use_llm += 1
            __token_pob__ = {}

            # Back off from the 4-gram model to lower orders until the current
            # context yields a non-empty next-token distribution
            num = 0
            num_ = 4
            while __token_pob__ == {} and num < 3:

                probs = kneser_ney_smoothing(prob[num], prob[num+1])
                __inputs__ = tuple(input_tokens[-(3-num):])
                __token_pob__ = probs.get(__inputs__, {})

                ##print(num, "\n", num_)

                num += 1
                num_ -= 1
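
            # Back-off illustration (hypothetical tokens): if input_tokens ends
            # with [..., 'Ġthe', 'ĠNobel', 'ĠPrize'], the loop queries the
            # smoothed 4-gram table with ('Ġthe', 'ĠNobel', 'ĠPrize'), then the
            # 3-gram table with ('ĠNobel', 'ĠPrize'), then the 2-gram table
            # with ('ĠPrize',), stopping at the first non-empty distribution.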

            ##print(f"Next word probs N_GRAM:{__token_pob__},\n input_{num_}_gram: {__inputs__},\n using {prob_list[num]}_counter and {prob_list[num-1]}_counter; probability exist: {__token_pob__ != {}}")
            # df now holds the n-gram candidates; their total probability mass
            # feeds the decision rule below
            df = pd.DataFrame(list(__token_pob__.items()), columns=['Token', 'Probability'])
            df.index = df.index + 1
            token_tables.append((f"{token_no}>> Next token options from N_gram", df))

            token_no += 1
            ##print("Next token options from N_GRAM:")
            ##print(df)
            ##print("Cumulative Probability of N_gram:", np.sum(df['Probability']))
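
            # Decision rule, as implemented below: use the n-gram model when the
            # LLM's top-k nucleus is uncertain (cumulative probability below
            # `threshold`), the n-gram table has candidates, and at least 4 LLM
            # tokens have already been emitted (the `flag`/`count` warm-up); or
            # whenever the n-gram candidates carry more total probability mass
            # than the LLM's top-k cumulative probability.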
            if (cumulative_prob < threshold and __token_pob__ != {} and flag and count >= 4) or np.sum(df['Probability']) > cumulative_prob:
                Token_index += 1
                #if cumulative_prob < threshold and __token_pob__ != {} and flag == True and count >= 4 or entropy >= 0.6:

                ##print("Using n-gram model")
                next_token = max(__token_pob__, key=__token_pob__.get)

                # 'Ċ' is the GPT-2 newline token; fall back to the second-best
                # candidate so generation does not stall on a newline
                if next_token == 'Ċ':
                    sorted_tokens = sorted(__token_pob__.items(), key=lambda x: x[1], reverse=True)
                    if len(sorted_tokens) > 1:
                        next_token = sorted_tokens[1][0]
                        ##print("Second max token : ", next_token)
                    Tokens[Token_index] = [next_token, "ngram", __token_pob__[next_token]]
                    color_code = "#78bfd3"  # Light blue for n-gram
                    colored_text += f"<span style='color: {color_code}'>{tokenizer.convert_tokens_to_string(next_token)}</span>"
                else:
                    Tokens[Token_index] = [next_token, "ngram", __token_pob__[next_token]]
                    color_code = "#78bfd3"  # Light blue for n-gram
                    colored_text += f"<span style='color: {color_code}'>{tokenizer.convert_tokens_to_string(next_token)}</span>"

                ##print("n-gram token : ", next_token)
                input_tokens.append(next_token)
                generated_text = tokenizer.convert_tokens_to_string(input_tokens)

                ##print(generated_text)
                initial_context = generated_text
                #input_ids = tokenizer.encode(generated_text, return_tensors="pt").to(device='mps')
                input_ids = tokenizer.encode(generated_text, return_tensors="pt").to(device='cpu')

                use_ngram += 1

            else:

                ##print("Using LLM")
                Token_index += 1
                next_token = torch.multinomial(probabilities, num_samples=1)
                next_token_prob = probabilities[0, next_token].item()
                next_token_text = tokenizer.decode(next_token.item())

                ##print("LLM token : ", next_token_text)
                Tokens[Token_index] = [next_token_text, "llm", next_token_prob]
                color_code = "#c99a6e"  # Light brown for LLM
                colored_text += f"<span style='color: {color_code}'>{next_token_text}</span>"
                count += 1

                if count >= 4:
                    flag = True

                #token_no += 1
                input_ids = torch.cat([input_ids, next_token], dim=-1)

                if next_token.item() == tokenizer.eos_token_id:
                    break
                generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
                ##print(generated_text)
                initial_context = generated_text
                use_llm_back_up += 1

            ##print(initial_context)
            ##print('-----------------------------------------------------------------\n\n')

    generated_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)

    #total = use_llm + use_llm_back_up + use_ngram
    ##print(f"total: {use_llm} ({(use_llm / total) * 100:.2f}%)")
    ##print(f"use_llms: {use_llm_back_up} ({(use_llm_back_up / total) * 100:.2f}%)")
    ##print(f"use_ngram: {use_ngram} ({(use_ngram / total) * 100:.2f}%)")

    return generated_text, Tokens, token_tables, colored_text
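
# Illustrative call (values are examples, not defaults):
#   text, tokens, tables, html_out = generate_text_with_probs(
#       "The verdict in the case", context_text,
#       top_p=0.9, max_length=30, top_k=5, threshold=0.6)
# `tokens` maps position -> [token, "llm" | "ngram", probability]; `html_out`
# colours n-gram tokens light blue (#78bfd3) and LLM tokens brown (#c99a6e).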

def save_content_as_file(question, docs):
    # Fetch the web content based on the question
    content = get_web_content(question, docs)

    # Define file path to save the content
    file_path = "fetched_content.txt"

    # Write the content to a text file
    with open(file_path, "w") as f:
        f.write(content)

    # Return the file path to download
    return file_path

def combined_model_predictions(query, initial_context, top_p, max_length, top_k, threshold, docs):
    Question = [query]
    context_text = get_web_content(Question[0], docs)
    print('Content Fetched')

    # Write context_text to a .txt file
    file_name = "context_corpora.txt"
    with open(file_name, "w") as file:
        file.write(context_text)

    # Generate the text using the model
    generated_text, tokens, token_tables, colored_html = generate_text_with_probs(initial_context, context_text, top_p, max_length, top_k, threshold)

    # Create a DataFrame for tokens
    data_list = [(token_index, tupes[0], tupes[1], tupes[2]) for token_index, tupes in tokens.items()]
    df = pd.DataFrame(data_list, columns=['Token_pos', 'Token', 'Source Model', 'Probability'])

    # Return the file path for download, colored HTML, and DataFrames
    return file_name, colored_html, df, token_tables


# Gradio interface
iface = gr.Interface(
    fn=combined_model_predictions,
    inputs=[
        gr.Textbox(lines=2, placeholder="Enter query here..."),
        gr.Textbox(lines=2, placeholder="Enter initial context here..."),
        gr.Slider(0, 1, step=0.01, value=0.9, label="Top-p (nucleus) sampling"),
        gr.Slider(1, 100, value=4, step=1, label="Max Length"),
        gr.Slider(1, 50, value=5, step=1, label="Top-k"),
        gr.Slider(0, 1, step=0.01, value=0.9, label="LLM cumulative Threshold"),
        gr.Slider(1, 50, step=1, value=10, label="Web-retrieved docs to fetch")
    ],
    outputs=[
        gr.File(label="Download Context Corpora"),
        gr.HTML(label="Generated Text"),
        gr.Dataframe(label="Tokens"),
        gr.Dataframe(label="Token tables"),
    ],
    title="Next Token Visualizer (GPT-2-large - 774M param.)"
)

iface.launch()