Spaces:
Runtime error
Runtime error
import os
import re
import string

# Install the third-party dependencies at start-up; this must run before the
# imports below that rely on them.
os.system("pip install pdfminer.six rank_bm25 torch transformers")

import gradio as gr
import numpy as np
import pdfminer
import torch
from gradio.mix import Series
from pdfminer.high_level import extract_text
from rank_bm25 import BM25Okapi
from transformers import pipeline
#from termcolor import colored
def read_pdf(file, len_doc=400, overlap=50):
    """Extract the text of a PDF and cut it into overlapping character chunks.

    Args:
        file: Either an uploaded-file object exposing the on-disk path as
            ``file.name`` or a plain path string.
        len_doc: Maximum number of characters per chunk.
        overlap: Number of characters shared by two consecutive chunks.

    Returns:
        The chunks in document order; an empty list when no text was
        extracted.

    Raises:
        ValueError: If ``len_doc``/``overlap`` would give a non-positive
            stride (the chunking could then never advance).
    """
    if len_doc <= 0 or not 0 <= overlap < len_doc:
        raise ValueError(
            "len_doc must be positive and overlap must satisfy "
            "0 <= overlap < len_doc"
        )

    # Fall back to the argument itself when it has no ``name`` attribute,
    # i.e. when a path string is passed directly.
    path = getattr(file, "name", file)
    text = extract_text(path)

    # Each chunk starts ``stride`` characters after the previous one, so
    # neighbouring chunks share ``overlap`` characters.
    stride = len_doc - overlap
    return [text[start:start + len_doc] for start in range(0, len(text), stride)]
# BM25 is used as the retriever: it performs the first round of candidate
# filtering based on word-level matching.
def bm25_tokenizer(text):
    """Turn *text* into the list of tokens used for BM25 indexing and queries.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece. Pieces that end up empty or
    that are one of the stop words below are discarded.
    """
    ignored = {'a', 'the', 'am', 'is', 'are', 'who', 'how', 'where', 'when', 'why'}
    cleaned = (piece.strip(string.punctuation) for piece in text.lower().split())
    return [word for word in cleaned if word and word not in ignored]
def retrieval(query, top_k_retriver, docs, bm25_index=None):
    """Return the BM25 hits for *query* among *docs*, best match first.

    Args:
        query: The user question as free text.
        top_k_retriver: Maximum number of hits to return.
        docs: The text chunks; ``docs[i]`` must be the chunk that was indexed
            at position ``i``.
        bm25_index: Optional pre-built index over *docs* (any object with a
            ``get_scores(tokens)`` method). When omitted, a ``BM25Okapi``
            index is built from *docs* here, so the function does not depend
            on any module-level state.

    Returns:
        A list of dicts with keys ``'corpus_id'``, ``'score'`` and ``'docs'``,
        ordered by descending score. Chunks whose score is not positive are
        left out, so the list may be shorter than ``top_k_retriver`` or empty.
    """
    if bm25_index is None:
        if not docs:
            # Nothing to index and therefore nothing to retrieve.
            return []
        bm25_index = BM25Okapi([bm25_tokenizer(doc) for doc in docs])

    bm25_scores = bm25_index.get_scores(bm25_tokenizer(query))
    # argsort is ascending; reverse it to get the best-scoring chunks first.
    # The result is already ordered, so no further sort is needed.
    top_n = np.argsort(bm25_scores)[::-1][:top_k_retriver]
    return [
        {'corpus_id': idx, 'score': bm25_scores[idx], 'docs': docs[idx]}
        for idx in top_n
        if bm25_scores[idx] > 0
    ]
# Question-answering pipeline used by qa_ranker; it is created once, when the
# module is loaded, and reused for every request.
qa_model = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
)
def qa_ranker(query, docs_, top_k_ranker):
    """Run the QA model on every passage and keep the highest-scoring answers.

    Args:
        query: The question passed to the QA model.
        docs_: Candidate passages, each used as the context for one model call.
        top_k_ranker: Maximum number of answers to return.

    Returns:
        The model outputs sorted by descending ``'score'`` and truncated to
        ``top_k_ranker`` entries; each output additionally carries the passage
        it was produced from under the ``'doc'`` key.
    """
    def read_passage(passage):
        prediction = qa_model(question=query, context=passage)
        # Remember which passage produced this prediction.
        prediction['doc'] = passage
        return prediction

    predictions = [read_passage(passage) for passage in docs_]
    predictions.sort(key=lambda item: item['score'], reverse=True)
    return predictions[:top_k_ranker]
def final_qa_pipeline(file, query):
    """Answer *query* from the uploaded PDF.

    The PDF is split into chunks, the chunks are indexed with BM25, the best
    BM25 candidates are passed to the QA model, and the top-ranked answer is
    returned.

    Args:
        file: The uploaded PDF as handed over by the interface, or ``None``
            when nothing was uploaded.
        query: The question to answer.

    Returns:
        A ``(answer, score)`` tuple, or ``("No match", 0)`` when there is no
        file, the PDF yields no text, or no chunk matches the question.
    """
    # The index is published under the module-level name ``bm25`` because that
    # is the name the retrieval step looks up; binding it only locally would
    # leave it invisible outside this function.
    global bm25

    if file is None:
        return ("No match", 0)

    docs = read_pdf(file)
    if not docs:
        # No text was extracted, so there is nothing to index or search.
        return ("No match", 0)

    bm25 = BM25Okapi([bm25_tokenizer(doc) for doc in docs])

    top_k_retriver, top_k_ranker = 10, 1
    lvl1 = retrieval(query, top_k_retriver, docs)
    if not lvl1:
        return ("No match", 0)

    fnl_rank = qa_ranker(query, [hit["docs"] for hit in lvl1], top_k_ranker)
    best = fnl_rank[0]
    return (best["answer"], best["score"])
# Web UI: a PDF upload and a question box go in; the answer and its score
# come out. The interface is launched as soon as it has been built.
pdf_upload = gr.inputs.File(label="input pdf file")
question_box = gr.inputs.Textbox(label="Question:")
answer_view = gr.outputs.HTML(label="Answer")
score_view = gr.outputs.HTML(label="Score")

iface = gr.Interface(
    fn=final_qa_pipeline,
    inputs=[pdf_upload, question_box],
    outputs=[answer_view, score_view],
)
iface.launch()