Yaron Koresh committed on
Commit
1281fb4
·
verified ·
1 Parent(s): 685e8ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -36
app.py CHANGED
@@ -559,51 +559,54 @@ def get_tensor_length(tensor):
559
  ret = ret * num
560
  return ret
561
 
562
def summarize(
    text, max_len=20, min_len=10
):
    """Summarize *text* by compressing 500-word chunks, then summarizing the joined result.

    Each chunk is repeatedly re-generated until its token length drops to
    *max_len*; the concatenated chunk summaries are then passed through one
    final generation. Texts under 5 words are returned unchanged.
    NOTE(review): relies on module-level `tokenizer`, `model`, `log`, and
    `get_tensor_length` defined elsewhere in this file.
    """
    log(f'CALL summarize')

    words = text.split()

    if len(words) < 5:
        print("Summarization Error: Text is too short, 5 words minimum!")
        return text

    prefix = "summarize: "
    combined = ""

    chunk_count = math.ceil(len(words) / 500)
    for chunk_idx in range(chunk_count):
        segment = " ".join(words[chunk_idx * 500:(chunk_idx + 1) * 500])
        token_ids = tokenizer.encode(prefix + segment, return_tensors="pt", truncation=False, add_special_tokens=True)

        # Keep re-summarizing this chunk until it is short enough.
        while get_tensor_length(token_ids) > max_len:
            token_ids = model.generate(
                token_ids,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
                max_length=max(get_tensor_length(token_ids) // 4, max_len),
                min_length=min_len,
            )

        piece = tokenizer.decode(token_ids[0], skip_special_tokens=True)
        combined = piece if combined == "" else combined + " " + piece

    # One final pass over the joined chunk summaries.
    token_ids = tokenizer.encode(prefix + combined, return_tensors="pt", truncation=False)
    output_ids = model.generate(
        token_ids,
        length_penalty=1.0,
        num_beams=4,
        early_stopping=True,
        max_length=max_len,
        min_length=min_len,
    )
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    log(f'RET summarize with summary as {summary}')
    return summary
607
 
608
  def generate_random_string(length):
609
  characters = str(ascii_letters + digits)
 
559
  ret = ret * num
560
  return ret
561
 
562
def _summarize(text, max_words=20):
    """Run a single summarization pass over *text*, capped at *max_words* output tokens.

    NOTE(review): relies on module-level `tokenizer`, `model`, and
    `get_tensor_length` defined elsewhere in this file.
    """
    prompt = "summarize: " + text
    token_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=False)
    # Beam count scales with input length (quarter of the token count, min 4).
    beam_count = max(get_tensor_length(token_ids) // 4, 4)
    output_ids = model.generate(
        token_ids,
        length_penalty=2.0,
        num_beams=beam_count,
        early_stopping=True,
        max_length=max_words,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
573
+
574
+ def _summ_step(length):
575
+ return max(length // 3 , min(10,length-4))
576
+
577
def summarize(text, max_words=20):
    """Iteratively shrink *text* until it is at most *max_words* words long.

    Texts of 510+ words are first compressed window-by-window (summarize the
    leading 510 words, keep the tail verbatim, repeat); the result is then
    re-summarized in bounded steps until it fits. Returns *text* unchanged
    (after printing an error) when the input is shorter than 5 words or
    *max_words* is outside [5, 500].

    NOTE(review): relies on module-level `log`, `_summarize`, and
    `_summ_step` defined elsewhere in this file.
    """
    log(f'CALL summarize')

    words = text.split()

    if len(words) < 5:
        print("Summarization Error: Text is too short, 5 words minimum.")
        return text

    if max_words < 5 or max_words > 500:
        print("Summarization Error: max_words value must be between 5 and 500 words.")
        return text

    words_length = len(words)

    if words_length >= 510:
        # Per-window target length, scaled so the whole text shrinks roughly
        # proportionally: integer form of 500 // (words_length / 500).
        # Clamped to >= 5 so model.generate always receives a usable int
        # max_length (the original float value broke generate's contract).
        shrink_step = max(5, (500 * 500) // words_length)
        while words_length >= 510:
            words = text.split()
            head_summary = _summarize(" ".join(words[:510]), shrink_step)
            # Join summary and untouched tail with spaces; the original
            # concatenated them directly, fusing the summary's last word
            # with the tail's first word.
            text = " ".join([head_summary] + words[510:])
            words_length = len(text.split())

    # Shrink the whole text in bounded steps until it fits max_words.
    while words_length > max_words:
        step = _summ_step(words_length)
        text = _summarize(text, words_length - step)
        words_length = len(text.split())

    log(f'RET summarize with text as {text}')
    return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
610
 
611
  def generate_random_string(length):
612
  characters = str(ascii_letters + digits)