sagar008 committed · Commit a166f8e · verified · 1 Parent(s): fcc0ada

Update app.py

Files changed (1):
1. app.py +9 -24
app.py CHANGED
@@ -5,24 +5,25 @@ import nltk
 import os
 import uvicorn
 
+from chunker import chunk_by_token_limit
+
 nltk.download('punkt', quiet=True)
 
 app = FastAPI()
 
-HF_AUTH_TOKEN = os.getenv("HF_TOKEN")
+HF_AUTH_TOKEN = os.getenv("HF_TOKEN")
 
 MODEL_NAME = "VincentMuriuki/legal-summarizer"
-summarizer = pipeline("summarization", model=MODEL_NAME, token=HF_AUTH_TOKEN)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_AUTH_TOKEN)
+summarizer = pipeline("summarization", model=MODEL_NAME, use_auth_token=HF_AUTH_TOKEN)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=HF_AUTH_TOKEN)
 
 class SummarizeInput(BaseModel):
     text: str
 
 class ChunkInput(BaseModel):
     text: str
-    max_tokens: int = 512  # Default chunk size
+    max_tokens: int = 1024
 
-# Summarize endpoint
 @app.post("/summarize")
 def summarize_text(data: SummarizeInput):
     summary = summarizer(data.text, max_length=150, min_length=30, do_sample=False)
@@ -30,25 +31,9 @@ def summarize_text(data: SummarizeInput):
 
 @app.post("/chunk")
 def chunk_text(data: ChunkInput):
-    sentences = nltk.sent_tokenize(data.text)
-    chunks = []
-    current_chunk = ""
-    current_token_count = 0
-
-    for sentence in sentences:
-        token_count = len(tokenizer.tokenize(sentence))
-        if current_token_count + token_count > data.max_tokens:
-            if current_chunk:
-                chunks.append(current_chunk.strip())
-            current_chunk = sentence
-            current_token_count = token_count
-        else:
-            current_chunk = f"{current_chunk} {sentence}".strip()
-            current_token_count += token_count
-
-    if current_chunk:
-        chunks.append(current_chunk.strip())
-
+    chunks = chunk_by_token_limit(data.text, data.max_tokens, tokenizer)
     return {"chunks": chunks}
+
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=7860)
+
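The chunker module imported in the new version is not part of this commit. A minimal sketch of chunk_by_token_limit, assuming it simply factors the sentence-accumulation logic deleted from chunk_text into a standalone helper, might look like the following; only the module name, function name, and call signature chunk_by_token_limit(text, max_tokens, tokenizer) are confirmed by the diff, and the body mirrors the removed inline code:

# chunker.py: hypothetical sketch; only the import is confirmed by the diff.
import nltk

def chunk_by_token_limit(text, max_tokens, tokenizer):
    """Greedily pack whole sentences into chunks of at most max_tokens tokens."""
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(tokenizer.tokenize(sentence))
        if current_token_count + token_count > max_tokens:
            # Adding this sentence would overflow the limit: flush the
            # current chunk and start a new one with this sentence.
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            current_chunk = f"{current_chunk} {sentence}".strip()
            current_token_count += token_count

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

As in the removed inline version, a single sentence longer than max_tokens still ends up as its own oversized chunk; the refactor changes where the logic lives, not what it does.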
 
 
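With the app running, the refactored /chunk endpoint behaves the same from a client's point of view. A hypothetical local call for illustration (the URL derives from the uvicorn.run line above; requests is a third-party HTTP client, not part of app.py):

import requests

resp = requests.post(
    "http://localhost:7860/chunk",
    json={"text": "First sentence. Second sentence.", "max_tokens": 1024},
)
print(resp.json()["chunks"])  # e.g. ["First sentence. Second sentence."]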