File size: 13,553 Bytes
8fa285a
1
{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {"provenance": [], "gpuType": "T4", "mount_file_id": "1jjFDrzAM6yig2d1fheu73jR5blsurmSJ", "authorship_tag": "ABX9TyP+cpVJRqyG/0VxKx66t7Gx"},
  "kernelspec": {"name": "python3", "display_name": "Python 3"},
  "language_info": {"name": "python"},
  "accelerator": "GPU"
 },
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {"id": "jSGFQUNTtDbU"},
   "source": [
    "# Medical NER: train a spaCy model and publish it to the Hugging Face Hub\n",
    "\n",
    "This notebook downloads the `finalepoch/medical-ner` Kaggle dataset, converts its character-offset annotations into spaCy `DocBin` files (with a held-out dev split), trains a blank English NER pipeline with the spaCy CLI, packages the best checkpoint as a wheel and pushes it to the Hugging Face Hub. The last section shows how to install and use the published model.\n",
    "\n",
    "It is written for Google Colab and works inside `MyDrive/medical_ner`, so the dataset, config and trained model are stored on Google Drive."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "Psa2bcm2iNUu"},
   "outputs": [],
   "source": [
    "# Colab only. This has to run before everything else because\n",
    "# requirements.txt and all artifacts live in the Drive project folder.\n",
    "import os\n",
    "\n",
    "from google.colab import drive\n",
    "\n",
    "PROJECT_DIR = '/content/drive/MyDrive/medical_ner'\n",
    "\n",
    "drive.mount('/content/drive')\n",
    "os.chdir(PROJECT_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "2CY5C0aJhKQi"},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "%pip install -r requirements.txt -q"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "6AvbE4cddWUx"},
   "outputs": [],
   "source": [
    "import os\n",
    "import random\n",
    "import shutil\n",
    "\n",
    "import kagglehub\n",
    "import matplotlib.pyplot as plt\n",
    "import nltk\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import spacy\n",
    "from spacy import displacy\n",
    "from spacy.tokens import DocBin\n",
    "from spacy.util import filter_spans\n",
    "from tqdm.auto import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "KAGGLE_DATASET = \"finalepoch/medical-ner\"\n",
    "DATASET_DIR = \"dataset\"\n",
    "CORPUS_FILE = os.path.join(DATASET_DIR, \"Corona2.json\")\n",
    "\n",
    "TRAIN_FILE = \"train.spacy\"\n",
    "DEV_FILE = \"dev.spacy\"\n",
    "DEV_FRACTION = 0.2  # share of the annotated examples held out for evaluation\n",
    "SEED = 42           # makes the train/dev split reproducible\n",
    "\n",
    "# One displaCy colour per entity label.\n",
    "ENTITY_COLORS = {\"PATHOGEN\": \"#F67DE3\", \"MEDICINE\": \"#7DF6D9\", \"MEDICALCONDITION\": \"#a6e22d\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def render_entities(doc):\n",
    "    \"\"\"Display the entities of `doc` inline, coloured by label (see ENTITY_COLORS).\"\"\"\n",
    "    displacy.render(doc, style=\"ent\", options={\"colors\": ENTITY_COLORS}, jupyter=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "DZTAhe6lazWv"},
   "outputs": [],
   "source": [
    "cache_path = kagglehub.dataset_download(KAGGLE_DATASET)\n",
    "destination_path = os.path.join(os.getcwd(), DATASET_DIR)\n",
    "\n",
    "if os.path.exists(destination_path):\n",
    "    print(f\"Path '{destination_path}' already exists. Removing the existing directory...\")\n",
    "    shutil.rmtree(destination_path)\n",
    "\n",
    "# Copy instead of move: moving the files would empty kagglehub's local\n",
    "# cache, so every re-run would have to download the dataset again.\n",
    "shutil.copytree(cache_path, destination_path)\n",
    "\n",
    "print(\"Dataset copied to:\", destination_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and inspect the annotations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "A0qCTk0RdXFa"},
   "outputs": [],
   "source": [
    "corpus_raw = pd.read_json(CORPUS_FILE)\n",
    "corpus_raw.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "obPxOKDAdXS6"},
   "outputs": [],
   "source": [
    "list(corpus_raw['examples'][0].keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "HJVKRu2WdXb4"},
   "outputs": [],
   "source": [
    "corpus_raw['examples'][0]['content']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "MJrUNCMDeYBh"},
   "outputs": [],
   "source": [
    "corpus_raw['examples'][0]['annotations'][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Convert the annotations to spaCy training data\n",
    "\n",
    "Each example becomes a `text` plus a list of `(start_char, end_char, LABEL)` tuples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "4CFh5Vyweygl"},
   "outputs": [],
   "source": [
    "annotated_examples = [\n",
    "    {\n",
    "        'text': example['content'],\n",
    "        'entities': [\n",
    "            (annotation['start'], annotation['end'], annotation['tag_name'].upper())\n",
    "            for annotation in example['annotations']\n",
    "        ],\n",
    "    }\n",
    "    for example in corpus_raw['examples']\n",
    "]\n",
    "len(annotated_examples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "KMArJHrIe7fX"},
   "outputs": [],
   "source": [
    "annotated_examples[0]['entities']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "-ju4Njmi3kZF"},
   "outputs": [],
   "source": [
    "# All distinct entity labels that occur in the annotations.\n",
    "entity_labels = {\n",
    "    label\n",
    "    for example in annotated_examples\n",
    "    for _, _, label in example['entities']\n",
    "}\n",
    "\n",
    "print(\"Entities in the dataset:\", entity_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "QTm8_aRje-G1"},
   "outputs": [],
   "source": [
    "# Sanity check: the character offsets of the first annotation should\n",
    "# slice the annotated surface text out of the document.\n",
    "first_example = annotated_examples[0]\n",
    "start, end, label = first_example['entities'][0]\n",
    "first_example['text'][start:end], label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "a4mb_MpwfPd_"},
   "outputs": [],
   "source": [
    "def build_doc_bin(nlp, examples, desc=None):\n",
    "    \"\"\"Convert annotated examples into a spaCy DocBin.\n",
    "\n",
    "    Args:\n",
    "        nlp: pipeline whose tokenizer is used to create the docs.\n",
    "        examples: dicts with a 'text' string and an 'entities' list of\n",
    "            (start_char, end_char, label) tuples.\n",
    "        desc: optional label for the progress bar.\n",
    "\n",
    "    Returns:\n",
    "        (doc_bin, n_skipped): the docs ready to be written to disk and the\n",
    "        number of annotations dropped because `Doc.char_span` could not\n",
    "        align them to tokens.\n",
    "    \"\"\"\n",
    "    doc_bin = DocBin()\n",
    "    n_skipped = 0\n",
    "    for example in tqdm(examples, desc=desc):\n",
    "        doc = nlp.make_doc(example['text'])\n",
    "        spans = []\n",
    "        for start, end, label in example['entities']:\n",
    "            # \"contract\" shrinks a span to the tokens that lie fully inside\n",
    "            # the character range; the result is None if there are none.\n",
    "            span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
    "            if span is None:\n",
    "                n_skipped += 1\n",
    "            else:\n",
    "                spans.append(span)\n",
    "        # Entities must not overlap; filter_spans keeps the longest span of\n",
    "        # each overlapping group.\n",
    "        doc.set_ents(filter_spans(spans))\n",
    "        doc_bin.add(doc)\n",
    "    return doc_bin, n_skipped"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / dev split\n",
    "\n",
    "A held-out dev set is required so that the scores printed during training, and the choice of `model-best`, are measured on examples the model was not trained on."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_blank = spacy.blank(\"en\")\n",
    "\n",
    "shuffled_examples = random.Random(SEED).sample(annotated_examples, k=len(annotated_examples))\n",
    "n_dev = max(1, round(len(shuffled_examples) * DEV_FRACTION))\n",
    "dev_examples = shuffled_examples[:n_dev]\n",
    "train_examples = shuffled_examples[n_dev:]\n",
    "assert train_examples, \"not enough annotated examples to hold out a dev set\"\n",
    "\n",
    "for path, split in ((TRAIN_FILE, train_examples), (DEV_FILE, dev_examples)):\n",
    "    doc_bin, n_skipped = build_doc_bin(nlp_blank, split, desc=path)\n",
    "    doc_bin.to_disk(path)\n",
    "    print(f\"{path}: {len(split)} docs, {n_skipped} misaligned entities skipped\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train the NER pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# --force: config.cfg persists on Drive, and without it a re-run refuses to\n",
    "# overwrite the file and training silently continues with the old config.\n",
    "!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m spacy train config.cfg --output ./ --paths.train ./{TRAIN_FILE} --paths.dev ./{DEV_FILE}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Try the trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nlp_trained = spacy.load(\"model-best\")\n",
    "\n",
    "demo_doc = nlp_trained('''\n",
    "The patient was prescribed Aspirin for their heart condition.\n",
    "The doctor recommended Ibuprofen to alleviate the patient's headache.\n",
    "The patient is suffering from diabetes, and they need to take Metformin regularly.\n",
    "After the surgery, the patient experienced some post-operative complications, including infection.\n",
    "The patient is currently on a regimen of Lisinopril to manage their high blood pressure.\n",
    "The antibiotic course for treating the bacterial infection should be completed as prescribed.\n",
    "The patient's insulin dosage needs to be adjusted to better control their blood sugar levels.\n",
    "The physician suspects that the patient may have pneumonia and has ordered a chest X-ray.\n",
    "The patient's cholesterol levels are high, and they have been advised to take Atorvastatin.\n",
    "The allergy to penicillin was noted in the patient's medical history.\n",
    "''')\n",
    "\n",
    "render_entities(demo_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Package the model and push it to the Hugging Face Hub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "6EcdTRtYk6pU"},
   "outputs": [],
   "source": [
    "%pip install spacy-huggingface-hub -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "lRGHxjwEr-Jr"},
   "outputs": [],
   "source": [
    "# Interactive: paste a Hugging Face access token with write permission.\n",
    "# Never hardcode the token in the notebook.\n",
    "!huggingface-cli login"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "HT9em4Wkr_AX"},
   "outputs": [],
   "source": [
    "# --force: the output directory persists on Drive, and without it a re-run\n",
    "# fails because the package directory already exists.\n",
    "!python -m spacy package \"model-best\" \"output\" --build wheel --force"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "iE08sGHTVSoJ"},
   "outputs": [],
   "source": [
    "# Imported here rather than at the top because the package is only\n",
    "# installed by the %pip cell of this section.\n",
    "from spacy_huggingface_hub import push\n",
    "\n",
    "push(\"output/en_pipeline-0.0.0/dist/en_pipeline-0.0.0-py3-none-any.whl\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {"id": "xnZOxFSRuZAV"},
   "source": [
    "## Download the model and use it\n",
    "\n",
    "To use only this section in a fresh runtime, run the cells under *Imports and configuration* first; the download, training and upload cells can be skipped."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "6DD3GsPzbA9x"},
   "outputs": [],
   "source": [
    "%pip install https://huggingface.co/yuvarajareddy001/en_pipeline/resolve/main/en_pipeline-0.0.0-py3-none-any.whl -q"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"id": "P0afcvGEV_5q"},
   "outputs": [],
   "source": [
    "# Imported here rather than at the top because the package only exists\n",
    "# after the install cell above has run.\n",
    "import en_pipeline\n",
    "\n",
    "# The installed package can be loaded by name ...\n",
    "nlp_published = spacy.load(\"en_pipeline\")\n",
    "\n",
    "# ... or through the module itself.\n",
    "nlp_published = en_pipeline.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {"colab": {"base_uri": "https://localhost:8080/", "height": 122}, "id": "1suw46uFa4Bj", "outputId": "f7ad8feb-abfc-48d4-d038-00e149ed65e9"},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ],
      "text/html": [
       "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">While \n",
       "<mark class=\"entity\" style=\"background: #7DF6D9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    bismuth compounds\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MEDICINE</span>\n",
       "</mark>\n",
       " (\n",
       "<mark class=\"entity\" style=\"background: #7DF6D9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    Pepto-Bismol\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MEDICINE</span>\n",
       "</mark>\n",
       ") decreased the number of bowel movements in those with travelers' \n",
       "<mark class=\"entity\" style=\"background: #a6e22d; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    diarrhea\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MEDICALCONDITION</span>\n",
       "</mark>\n",
       ", they do not decrease the length of illness.[91] Anti-motility agents like \n",
       "<mark class=\"entity\" style=\"background: #7DF6D9; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    loperamide\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">MEDICINE</span>\n",
       "</mark>\n",
       " are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.</div></span>"
      ]
     },
     "metadata": {}
    }
   ],
   "source": [
    "published_doc = nlp_published(\"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.\")\n",
    "\n",
    "render_entities(published_doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {"id": "QEuugMLxss4J"},
   "source": [
    "## Appendix: Gradio demo (`app.py`)\n",
    "\n",
    "This code is not executed in this notebook. Save it as `app.py` to serve the published model through a small Gradio web UI.\n",
    "\n",
    "```python\n",
    "# app.py\n",
    "\n",
    "import subprocess\n",
    "\n",
    "import gradio as gr\n",
    "import spacy\n",
    "\n",
    "# Install the model package from Hugging Face\n",
    "MODEL_PACKAGE = \"https://huggingface.co/yuvarajareddy001/en_pipeline/resolve/main/en_pipeline-0.0.0-py3-none-any.whl\"\n",
    "\n",
    "subprocess.run([\"pip\", \"install\", MODEL_PACKAGE])\n",
    "\n",
    "nlp = spacy.load(\"en_pipeline\")\n",
    "\n",
    "# Function to process input text and display named entities\n",
    "def extract_entities(text):\n",
    "    doc = nlp(text)\n",
    "    colors = {\"PATHOGEN\": \"#F67DE3\", \"MEDICINE\": \"#7DF6D9\", \"MEDICALCONDITION\": \"#a6e22d\"}\n",
    "    options = {\"colors\": colors}\n",
    "\n",
    "    return spacy.displacy.render(doc, style=\"ent\", options=options, jupyter=False)\n",
    "\n",
    "# Gradio UI for Medical NER Model\n",
    "iface = gr.Interface(\n",
    "    fn=extract_entities,\n",
    "    inputs=gr.Textbox(lines=5, placeholder=\"Enter medical text here...\"),\n",
    "    outputs=\"html\",\n",
    "    title=\"🩺 Medical Named Entity Recognition (NER) Model\",\n",
    "    description=\"Enter medical text to extract entities such as **medical conditions, medications, and pathogens**.\",\n",
    "    examples=[\n",
    "        [\"\"\"John Doe, a 45-year-old man, visited the hospital after experiencing severe acute respiratory syndrome symptoms. The doctors suspected an infection caused by SARS coronavirus and initiated treatment using oseltamivir. Due to his history of asthma, he was monitored closely for any complications. He was advised to continue taking steroids for inflammation management. His condition improved within a week, and he was discharged with instructions to maintain social distancing.\"\"\"],\n",
    "        [\"\"\"While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.\"\"\"]\n",
    "    ],\n",
    "    theme=\"default\",\n",
    ")\n",
    "\n",
    "# Launch the Gradio app\n",
    "if __name__ == \"__main__\":\n",
    "    iface.launch(share=True)\n",
    "```"
   ]
  }
 ]
}