Akbartus commited on
Commit
d10533c
·
verified ·
1 Parent(s): 38289a8

Update main2.py

Browse files
Files changed (1) hide show
  1. main2.py +98 -24
main2.py CHANGED
@@ -1,28 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI
2
- from transformers import pipeline
3
-
4
- # Create a new FastAPI app instance
5
  app = FastAPI()
6
-
7
- # Initialize the text generation pipeline
8
- # This function will be able to generate text
9
- # given an input.
10
- summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
11
-
12
-
13
- # Define a function to handle the GET request at `/generate`
14
- # The generate() function is defined as a FastAPI route that takes a
15
- # string parameter called text. The function generates text based on the # input using the pipeline() object, and returns a JSON response
16
- # containing the generated text under the key "output"
17
- @app.get("/generate")
18
- def generate(text: str):
19
  """
20
- Using the text2text-generation pipeline from `transformers`, generate text
21
- from the given input text. The model used is `google/flan-t5-small`, which
22
- can be found [here](<https://huggingface.co/google/flan-t5-small>).
 
23
  """
24
- # Use the pipeline to generate text from the given input text
25
- output = summarizer(text, min_length=5, max_length=20)
26
-
27
- # Return the generated text in a JSON response
28
- return {"output": output[0]["generated_text"]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Wed Jan 4 05:56:28 2023
4
+
5
+ @author: dreji18
6
+ """
7
+
8
+ # loading the packages
9
+ from rake_nltk import Rake
10
+ import wikipedia
11
+ from rank_bm25 import BM25Okapi
12
+ import torch
13
+ from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
14
  from fastapi import FastAPI
 
 
 
15
  app = FastAPI()
16
+
17
@app.get("/")
def read_root():
    """Serve the root route with a fixed greeting payload."""
    greeting = {"Hello": "World"}
    return greeting
20
+
21
+ # keyword extraction function
22
def keyword_extractor(query):
    """
    Extract ranked key phrases from ``query`` with RAKE.

    A fresh ``Rake`` instance with default settings is created per call.
    According to the original author's notes, RAKE lower-cases the text,
    uses NLTK's English stopwords plus punctuation as phrase delimiters,
    and keeps multi-word phrases together (e.g. "deep learning").

    Returns the phrases exactly as ``Rake.get_ranked_phrases()`` yields
    them (noted by the original author as ranked highest to lowest).
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(query)
    return extractor.get_ranked_phrases()
33
+
34
+ # data collection using Wikipedia
35
def data_collection(search_words, max_chars=1000):
    """
    Collect Wikipedia summaries that mention at least one search phrase.

    The phrases are joined into a single query, up to five page titles are
    requested from ``wikipedia.search``, and each page summary is kept only
    if it contains (case-insensitively on the summary side) at least one of
    the phrases in ``search_words``.

    Parameters
    ----------
    search_words : list of str
        Key phrases to search for. They are matched against the lower-cased
        summary, so they are expected to already be lower case.
    max_chars : int, optional
        Maximum number of *characters* kept from each summary in the
        truncated list (default 1000, the value previously hard-coded).

    Returns
    -------
    tuple
        ``(information_list, pages_list, original_info)`` where
        ``information_list`` holds the summaries truncated to ``max_chars``
        characters, ``pages_list`` the matching page titles, and
        ``original_info`` the untruncated summaries, all in the same order.
    """
    # Nothing to search for: skip the network round-trip entirely.
    if not search_words:
        return [], [], []

    search_query = ' '.join(search_words)
    wiki_pages = wikipedia.search(search_query, results=5)

    original_info = []
    pages_list = []
    for title in wiki_pages:
        try:
            info = wikipedia.summary(title)
        except Exception:
            # Best effort: a page whose summary cannot be fetched is
            # skipped rather than failing the whole lookup. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            continue

        lowered = info.lower()  # lower-case once, not once per phrase
        if any(word in lowered for word in search_words):
            original_info.append(info)
            pages_list.append(title)

    # Character-based truncation keeps the downstream model input short.
    information_list = [item[:max_chars] for item in original_info]

    return information_list, pages_list, original_info
55
+
56
+ # document ranking function
57
def document_ranking(documents, query, n):
    """
    Return the ``n`` documents that best match ``query`` according to BM25.

    Documents and query are tokenized by splitting on single spaces, a
    ``BM25Okapi`` index is built over the documents, and its ``get_top_n``
    result is returned.

    Parameters
    ----------
    documents : list of str
        Candidate texts to rank.
    query : str
        The search string.
    n : int
        Maximum number of documents to return.

    Returns
    -------
    list of str
        The top-ranked documents; an empty list when ``documents`` is empty.
    """
    # An empty corpus cannot be indexed. The previous version swallowed the
    # resulting error and then failed with UnboundLocalError on return.
    if not documents:
        return []

    tokenized_corpus = [doc.split(" ") for doc in documents]
    bm25 = BM25Okapi(tokenized_corpus)
    tokenized_query = query.split(" ")
    return bm25.get_top_n(tokenized_query, documents, n)
68
+
69
# Cache for the tokenizer/model pair so they are loaded once per process
# instead of once per call.
_QNA_COMPONENTS = {}


def _load_qna_components():
    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
    if "model" not in _QNA_COMPONENTS:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', return_token_type_ids=True)
        model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', return_dict=False)
        # Publish both together so a failed load never leaves a half-filled cache.
        _QNA_COMPONENTS["tokenizer"] = tokenizer
        _QNA_COMPONENTS["model"] = model
    return _QNA_COMPONENTS["tokenizer"], _QNA_COMPONENTS["model"]


def qna(context, question):
    """
    Extract an answer span for ``question`` from ``context`` with DistilBERT.

    The question/context pair is encoded, passed through the
    ``distilbert-base-uncased-distilled-squad`` question-answering model, and
    the tokens between the highest-scoring start and end positions
    (inclusive) are converted back to a string.

    Parameters
    ----------
    context : str
        Text that is searched for the answer.
    question : str
        The question to answer.

    Returns
    -------
    str
        The decoded answer span; empty when the predicted end position
        precedes the predicted start position.
    """
    tokenizer, model = _load_qna_components()

    encoding = tokenizer.encode_plus(question, context)
    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        start_scores, end_scores = model(
            torch.tensor([input_ids]),
            attention_mask=torch.tensor([attention_mask]),
        )

    start_index = int(torch.argmax(start_scores))
    end_index = int(torch.argmax(end_scores))
    ans_tokens = input_ids[start_index:end_index + 1]

    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
    return tokenizer.convert_tokens_to_string(answer_tokens)
81
+
82
@app.get("/predict")
def answergen(search_string: str):
    """
    Answer ``search_string`` from Wikipedia summaries.

    Pipeline: extract key phrases from the query, collect matching Wikipedia
    summaries, keep the three best-ranked ones, and run the
    question-answering model on each.

    Parameters
    ----------
    search_string : str
        The user's question, taken from the ``search_string`` query
        parameter.

    Returns
    -------
    dict or list
        ``{"answer 1": ..., "answer 2": ..., "answer 3": ...}`` with one
        entry per ranked document (fewer than three keys when fewer
        documents were found), or a one-element list holding an apology
        message when nothing could be answered.
    """
    import logging  # local import: only needed on the failure path

    # The original returned a set literal here; a list with the same single
    # element is intended to keep the JSON response shape while being explicit.
    failure_response = ["sorry couldn't process the request"]

    try:
        keyword_list = keyword_extractor(search_string)
        information, _pages, _original_data = data_collection(keyword_list)
        datastore = document_ranking(information, search_string, 3)
        answers_list = [qna(document, search_string) for document in datastore]
    except Exception:
        # Top-level boundary: record the cause, keep the endpoint's
        # best-effort contract of always returning a payload.
        logging.getLogger(__name__).exception("answer generation failed")
        return failure_response

    if not answers_list:
        return failure_response

    # Number the answers from 1; do not assume exactly three are available.
    return {
        f"answer {rank}": answer
        for rank, answer in enumerate(answers_list, start=1)
    }
99
+
100
+ # run locally with: uvicorn main2:app --port 8000 --reload
101
+
102
+ #%%