sagar008 committed on
Commit
c4eb084
·
verified ·
1 Parent(s): 8967b9e

Create summarizer.py

Browse files
Files changed (1) hide show
  1. summarizer.py +88 -0
summarizer.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ from typing import List, Dict, Any
3
+ from transformers import pipeline, AutoTokenizer
4
+ import os
5
+
6
class DocumentSummarizer:
    """Summarize lists of text chunks with a Hugging Face summarization pipeline.

    The pipeline and tokenizer are not loaded in ``__init__``; call
    :meth:`initialize` once before using :meth:`summarize_texts_sync`.
    :meth:`batch_summarize` loads the pipeline on first use if needed.
    """

    # Values passed to the pipeline as ``max_length`` / ``min_length`` when
    # summarizing individual chunks.
    CHUNK_MAX_LENGTH = 128
    CHUNK_MIN_LENGTH = 24
    # Values passed when condensing the combined summary into a short one.
    SHORT_MAX_LENGTH = 96
    SHORT_MIN_LENGTH = 16
    # Combined summaries longer than this many characters are condensed again.
    SHORT_SUMMARY_THRESHOLD = 2000

    def __init__(self):
        self.summarizer = None
        self.tokenizer = None
        self.model_name = "VincentMuriuki/legal-summarizer"

    async def initialize(self):
        """Load the summarization pipeline and tokenizer if not already loaded.

        Reads the optional ``HF_TOKEN`` environment variable and forwards it
        as the ``token`` argument. Calling this again once the pipeline is
        loaded is a no-op.

        NOTE: the model load itself is a synchronous call, so it blocks the
        running event loop until it finishes.
        """
        if self.summarizer is not None:
            return

        print(f"🤖 Loading summarization model: {self.model_name}")
        start_time = time.time()

        hf_token = os.getenv("HF_TOKEN")
        self.summarizer = pipeline(
            "summarization",
            model=self.model_name,
            token=hf_token,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, token=hf_token)

        print(f"✅ Summarization model loaded in {time.time() - start_time:.2f}s")

    def _run_pipeline(self, texts: List[str], max_length: int, min_length: int) -> List[str]:
        """Run the pipeline over ``texts`` and return the ``summary_text`` values.

        Raises:
            RuntimeError: if the pipeline has not been loaded yet.
        """
        if self.summarizer is None:
            # Fail with a clear message instead of "'NoneType' is not callable".
            raise RuntimeError(
                "Summarization pipeline is not loaded; call initialize() first."
            )
        outputs = self.summarizer(
            texts,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            num_beams=1,
            truncation=True,
        )
        return [output["summary_text"] for output in outputs]

    async def batch_summarize(self, chunks: List[str]) -> Dict[str, Any]:
        """Summarize every chunk in one pipeline call and combine the results.

        Args:
            chunks: Texts to summarize.

        Returns:
            A dict with the keys ``actual_summary`` (per-chunk summaries
            joined with spaces), ``short_summary`` (a second-pass summary of
            the combined text when it exceeds ``SHORT_SUMMARY_THRESHOLD``
            characters, otherwise identical to ``actual_summary``),
            ``individual_summaries`` and ``time_taken``. The same keys are
            returned for empty input so callers can rely on a single shape.

        NOTE: the pipeline calls are synchronous and block the event loop
        while they run.
        """
        if not chunks:
            return {
                "actual_summary": "",
                "short_summary": "",
                "individual_summaries": [],
                "time_taken": "0.00s",
            }

        # Idempotent: only loads the model when initialize() was never called.
        await self.initialize()

        print(f"📝 Batch summarizing {len(chunks)} chunks...")
        start_time = time.time()

        summaries = self._run_pipeline(
            chunks, self.CHUNK_MAX_LENGTH, self.CHUNK_MIN_LENGTH
        )
        combined_summary = " ".join(summaries)

        # Only pay for a second pass when the combined text is long.
        short_summary = combined_summary
        if len(combined_summary) > self.SHORT_SUMMARY_THRESHOLD:
            short_summary = self._run_pipeline(
                [combined_summary], self.SHORT_MAX_LENGTH, self.SHORT_MIN_LENGTH
            )[0]

        processing_time = time.time() - start_time
        print(f"✅ Batch summarization completed in {processing_time:.2f}s")

        return {
            "actual_summary": combined_summary,
            "short_summary": short_summary,
            "individual_summaries": summaries,
            "time_taken": f"{processing_time:.2f}s",
        }

    def summarize_texts_sync(self, texts: List[str], max_length: int, min_length: int) -> Dict[str, Any]:
        """Synchronously summarize ``texts`` with caller-supplied length bounds.

        Args:
            texts: Texts to summarize.
            max_length: Forwarded to the pipeline as ``max_length``.
            min_length: Forwarded to the pipeline as ``min_length``.

        Returns:
            A dict with ``summaries``, ``count`` and ``time_taken``. Empty
            input yields an empty result without touching the pipeline.

        Raises:
            RuntimeError: if ``texts`` is non-empty and the pipeline has not
                been loaded via :meth:`initialize`.
        """
        if not texts:
            return {"summaries": [], "count": 0, "time_taken": "0.00s"}

        start_time = time.time()
        summaries = self._run_pipeline(texts, max_length, min_length)
        return {
            "summaries": summaries,
            "count": len(summaries),
            "time_taken": f"{time.time() - start_time:.2f}s",
        }