Yaron Koresh committed on
Commit
f968442
·
verified ·
1 Parent(s): a3ed68b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -22
app.py CHANGED
@@ -558,24 +558,45 @@ def summarize(
558
  text, max_len=20, min_len=10
559
  ):
560
  log(f'CALL summarize')
561
- inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=float('inf'), truncation=False)
562
- if get_tensor_length(inputs) < 3:
563
- print("Summarization Error: Text is too short, 3 words minimum!")
 
 
564
  return text
565
- i = 1
566
- while get_tensor_length(inputs) > max_len:
567
- print(f'DBG summarize 1 {i}')
568
- outputs = model.generate(
569
- torch.tensor(list(inputs[0][:512])),
570
- length_penalty=2.0,
571
- num_beams=min(4,get_tensor_length(inputs) - 1),
572
- early_stopping=True,
573
- max_length=max( get_tensor_length(inputs) // 4 , max_len ),
574
- min_length=min_len
575
- )
576
- inputs = torch.tensor([[*list(outputs[0]), *list(inputs[0][512:])]])
577
- i = i + 1
578
- summary = tokenizer.decode(inputs[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
579
  log(f'RET summarize with summary as {summary}')
580
  return summary
581
 
@@ -633,6 +654,64 @@ def all_pipes(pos,neg,artist,song):
633
 
634
  return imgs
635
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
636
  def translate(txt,to_lang="en",from_lang=False):
637
  log(f'CALL translate')
638
  if not from_lang:
@@ -640,12 +719,13 @@ def translate(txt,to_lang="en",from_lang=False):
640
  if(from_lang == to_lang):
641
  log(f'RET translate with txt as {txt}')
642
  return txt
643
- inputs = tokenizer.encode(f"translate {from_lang} to {to_lang}: " + txt, return_tensors="pt", max_length=float('inf'), truncation=False)
644
- chunks_length = math.ceil(get_tensor_length(inputs) / 512)
645
  ret = ""
646
- for index in range(chunks_length):
647
- chunk = torch.tensor([list(inputs[0][ index*512:index*512+512 ])])
648
- gen = model.generate(chunk)
 
649
  toks = tokenizer.decode(gen[0], skip_special_tokens=True)
650
  ret = ret + ("" if ret == "" else " ") + toks
651
  log(f'RET translate with ret as {ret}')
 
558
  text, max_len=20, min_len=10
559
  ):
560
  log(f'CALL summarize')
561
+
562
+ words = text.split()
563
+
564
+ if get_tensor_length(words) < 5:
565
+ print("Summarization Error: Text is too short, 5 words minimum!")
566
  return text
567
+
568
+ prefix = "summarize: "
569
+ ret = ""
570
+
571
+ for index in math.ceil( len(words) / 512 ):
572
+
573
+ chunk = " ".join(words[ index*512:(index+1)*512 ])
574
+ inputs = tokenizer.encode( prefix + chunk, return_tensors="pt", max_length=float('inf'), truncation=False)
575
+
576
+ while get_tensor_length(inputs) > max_len:
577
+
578
+ inputs = model.generate(
579
+ inputs,
580
+ length_penalty=2.0,
581
+ num_beams=4,
582
+ early_stopping=True,
583
+ max_length=max( get_tensor_length(inputs) // 4 , max_len ),
584
+ min_length=min_len
585
+ )
586
+
587
+ toks = tokenizer.decode(inputs[0], skip_special_tokens=True)
588
+ ret = ret + ("" if ret == "" else " ") + toks
589
+
590
+ inputs = tokenizer.encode( prefix + ret, return_tensors="pt", max_length=float('inf'), truncation=False)
591
+ gen = model.generate(
592
+ inputs,
593
+ length_penalty=1.0,
594
+ num_beams=4,
595
+ early_stopping=True,
596
+ max_length=max_len,
597
+ min_length=min_len
598
+ )
599
+ summary = tokenizer.decode(gen[0], skip_special_tokens=True)
600
  log(f'RET summarize with summary as {summary}')
601
  return summary
602
 
 
654
 
655
  return imgs
656
 
657
+ language_codes = {
658
+ "af": "Afrikaans",
659
+ "ar": "Arabic",
660
+ "bg": "Bulgarian",
661
+ "bn": "Bengali",
662
+ "ca": "Catalan",
663
+ "cs": "Czech",
664
+ "cy": "Welsh",
665
+ "da": "Danish",
666
+ "de": "German",
667
+ "el": "Greek",
668
+ "en": "English",
669
+ "es": "Spanish",
670
+ "et": "Estonian",
671
+ "fa": "Persian (Farsi)",
672
+ "fi": "Finnish",
673
+ "fr": "French",
674
+ "gu": "Gujarati",
675
+ "he": "Hebrew",
676
+ "hi": "Hindi",
677
+ "hr": "Croatian",
678
+ "hu": "Hungarian",
679
+ "id": "Indonesian",
680
+ "it": "Italian",
681
+ "ja": "Japanese",
682
+ "kn": "Kannada",
683
+ "ko": "Korean",
684
+ "lt": "Lithuanian",
685
+ "lv": "Latvian",
686
+ "mk": "Macedonian",
687
+ "ml": "Malayalam",
688
+ "mr": "Marathi",
689
+ "ne": "Nepali",
690
+ "nl": "Dutch",
691
+ "no": "Norwegian",
692
+ "pa": "Punjabi",
693
+ "pl": "Polish",
694
+ "pt": "Portuguese",
695
+ "ro": "Romanian",
696
+ "ru": "Russian",
697
+ "sk": "Slovak",
698
+ "sl": "Slovenian",
699
+ "so": "Somali",
700
+ "sq": "Albanian",
701
+ "sv": "Swedish",
702
+ "sw": "Swahili",
703
+ "ta": "Tamil",
704
+ "te": "Telugu",
705
+ "th": "Thai",
706
+ "tl": "Tagalog (Filipino)",
707
+ "tr": "Turkish",
708
+ "uk": "Ukrainian",
709
+ "ur": "Urdu",
710
+ "vi": "Vietnamese",
711
+ "zh-cn": "Chinese (Simplified)",
712
+ "zh-tw": "Chinese (Traditional)",
713
+ }
714
+
715
  def translate(txt,to_lang="en",from_lang=False):
716
  log(f'CALL translate')
717
  if not from_lang:
 
719
  if(from_lang == to_lang):
720
  log(f'RET translate with txt as {txt}')
721
  return txt
722
+ prefix = f"translate {language_codes[from_lang]} to {language_codes[to_lang]}: "
723
+ words = txt.split()
724
  ret = ""
725
+ for index in math.ceil( len(words) / 512 ):
726
+ chunk = " ".join(words[ index*512:(index+1)*512 ])
727
+ inputs = tokenizer.encode(prefix+chunk, return_tensors="pt", max_length=float('inf'), truncation=False)
728
+ gen = model.generate(chunk,input)
729
  toks = tokenizer.decode(gen[0], skip_special_tokens=True)
730
  ret = ret + ("" if ret == "" else " ") + toks
731
  log(f'RET translate with ret as {ret}')