Spaces:
Runtime error
Runtime error
File size: 3,603 Bytes
cb39233 0133578 cb39233 2108b38 cb39233 0133578 cb39233 0133578 cb39233 83a908e cb39233 83a908e cb39233 fd5f0b9 4474dca e973cb5 fd5f0b9 e973cb5 fd5f0b9 e973cb5 c1a3ab2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
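# -*- coding: utf-8 -*-
"""gradio_sindi.ipynb

Gradio demo for extractive question answering: the stored passage most similar
to the user's question is selected with TF-IDF cosine similarity, and a
Hugging Face question-answering model extracts the answer from that passage.

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/12KZGcYbsXlMWYC8U4aeR_Ex0u8fJLgly
"""
# libraries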
import gradio as gr
import torch
from transformers import pipeline
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
#import re
"""# data - text"""
# Passage table loaded once at import time; its "section_text" column is the
# corpus that context_func searches for the passage closest to a question.
splitted_df = pd.read_csv('splitted_df_jo.csv')
"""# getting context"""
# Deletion table for the characters stripped from a passage: '/', '(', ')',
# newline and '.'. Built once so each call is a single translate() pass.
_SYMBOLS_TO_REMOVE = str.maketrans("", "", "/()\n.")


def remove_symbols(text):
    """Return ``text`` with every '/', '(', ')', newline and '.' removed.

    Args:
        text: The string to clean.

    Returns:
        A new string containing the remaining characters of ``text`` in their
        original order. All other characters, including non-ASCII ones, are
        kept unchanged.
    """
    return text.translate(_SYMBOLS_TO_REMOVE)
# Lazily built (vectorizer, passage matrix) pair reused by every call to
# context_func, so the corpus is vectorized once instead of once per question.
_TFIDF_INDEX = None


def _get_tfidf_index():
    """Fit the TF-IDF vectorizer on the passages on first use and return it with the passage matrix."""
    global _TFIDF_INDEX
    if _TFIDF_INDEX is None:
        vectorizer = TfidfVectorizer()
        passage_tfidf = vectorizer.fit_transform(splitted_df["section_text"])
        _TFIDF_INDEX = (vectorizer, passage_tfidf)
    return _TFIDF_INDEX


def context_func(message):
    """Return the stored passage most similar to ``message``.

    The passages in ``splitted_df["section_text"]`` and the question are
    represented as TF-IDF vectors; the passage with the highest cosine
    similarity to the question is selected and cleaned with
    ``remove_symbols``.

    Args:
        message: The user's question as a string.

    Returns:
        The cleaned text of the best-matching passage. When several passages
        tie (for example when the question shares no vocabulary with the
        corpus and every similarity is 0), the first one is returned, because
        ``argmax`` reports the first maximum.
    """
    vectorizer, passage_tfidf = _get_tfidf_index()

    # Project the question into the vocabulary learned from the passages.
    question_tfidf = vectorizer.transform([message])

    # One row of similarities: the question against every passage.
    similarities = cosine_similarity(question_tfidf, passage_tfidf)[0]
    most_similar_index = similarities.argmax()

    # argmax yields a row position, so select positionally rather than by
    # index label; the two only coincide for a default RangeIndex.
    most_similar_context = splitted_df["section_text"].iloc[most_similar_index]
    return remove_symbols(most_similar_context)
# Hub identifier of the question-answering checkpoint used by answer_question.
_QA_MODEL_NAME = "nlp-group/sindi-bert-final"

# Lazily populated (tokenizer, model) pair so the checkpoint is loaded once
# per process rather than once per question.
_QA_COMPONENTS = None


def _get_qa_components():
    """Load the QA tokenizer and model on first use and return the cached pair."""
    global _QA_COMPONENTS
    if _QA_COMPONENTS is None:
        tokenizer = AutoTokenizer.from_pretrained(_QA_MODEL_NAME)
        model = AutoModelForQuestionAnswering.from_pretrained(_QA_MODEL_NAME)
        _QA_COMPONENTS = (tokenizer, model)
    return _QA_COMPONENTS


def answer_question(question):
    """Answer ``question`` from the most relevant stored passage.

    The passage is selected with ``context_func``; the question/passage pair
    is then fed to the question-answering model, and the token span between
    the highest-scoring start and end positions is decoded as the answer.

    Args:
        question: The user's question as a string.

    Returns:
        A tuple ``(answer, context)``: the decoded answer span and the passage
        it was extracted from. ``answer`` is an empty string when the
        predicted end position precedes the predicted start position.
    """
    context = context_func(question)
    tokenizer, model = _get_qa_components()

    # Encode question and passage as one pair, truncated to 512 tokens.
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    # +1 because the slice end is exclusive and the end token belongs to the answer.
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.decode(inputs["input_ids"][0][answer_start:answer_end])
    return answer, context
def main():
    """Build and launch the Women Cancer ChatBot Gradio interface.

    The question-answering checkpoint is loaded once up front, then a
    ``gr.Interface`` wrapping ``answer_question`` is created and launched.
    Users can enter questions related to women's cancer topics; the interface
    shows the extracted answer together with the passage it was taken from.

    Returns:
        The value returned by ``Interface.launch``.

    Example:
        >>> main()  # doctest: +SKIP
    """
    # Load the checkpoint before serving so a missing or unreachable model
    # aborts start-up instead of surfacing on the first question. The loaded
    # objects are not needed here: answer_question obtains its own.
    AutoTokenizer.from_pretrained("nlp-group/sindi-bert-final")
    AutoModelForQuestionAnswering.from_pretrained("nlp-group/sindi-bert-final")

    iface = gr.Interface(
        fn=answer_question,
        inputs=["text"],
        # answer_question returns (answer, context), so declare one output
        # component per returned value; with a single component the whole
        # tuple would be rendered in the "Answer" box.
        outputs=[gr.Textbox(label="Answer"), gr.Textbox(label="Context")],
        title="Women Cancer ChatBot",
        description="How can I help you?",
        examples=[
            ["What is breast cancer?"],
            ["What are treatments for cervical cancer?"],
        ],
    )
    return iface.launch(debug=True, share=True)
# Launch the interface only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()