Spaces:

Nainglinthu
/

Legal_Text_Classifier

Running

App Files Files Community

Legal_Text_Classifier / legaltextclassification.py

Nainglinthu

Upload legaltextclassification.py

3d56e9e verified 3 months ago

raw

history blame contribute delete

6.38 kB

	# -*- coding: utf-8 -*-
	"""LegalTextClassification.ipynb

	Automatically generated by Colab.

	Original file is located at
	https://colab.research.google.com/drive/1x6EcLSN3qEgm6sVIcmX0bYeXj7AdDQlW
	!pip install gradio
	import gradio as gr

	def greet(name):
	return "Hello " + name + "!!"

	demo = gr.Interface(fn=greet, inputs="text", outputs="text")
	demo.launch()

	#About Data
	The dataset contains a total of 25000 legal cases in the form of text documents. Each document has been annotated with catchphrases, citation sentences, citation catchphrases, and citation classes. Citation classes indicate the type of treatment given to the cases cited by the present case.

	The Legal Citation Text Classification dataset is provided in CSV format. The dataset has *four columns, namely Case ID, Case Outcome, Case Title, and Case Text*. The Case ID column contains a unique identifier for each legal case, the Case Outcome column indicates the outcome of the case, the Case Title column contains the title of the legal case, and the Case Text column contains the text of the legal case.

	Kaggle Dataset Link: https://www.kaggle.com/datasets/amohankumar/legal-text-classification-dataset/data

	#Importing Data
	"""

	from google.colab import files
	import pandas as pd

	# Load the legal-citation CSV from the current working directory.
	df = pd.read_csv('legal_text_classification.csv')
	df.head()  # notebook preview; the result is discarded when run as a script

	"""#Data Preprocessing and Description"""

	# Report the column index, the column count, and the (rows, columns) shape.
	column_index = df.columns
	print(column_index)
	print(len(column_index))
	print(df.shape)

	# Missing values per column before cleaning.
	print(df.isna().sum())

	# Drop rows that have no case text, then the identifier and title columns.
	df = df.dropna(subset=['case_text']).drop(columns=["case_id", "case_title"])

	# Missing values per column after cleaning.
	print(df.isna().sum())

	import re

	def text_ready(text):
	    """Normalise one case text before tokenisation.

	    Lowercases the text, deletes every character that is neither a word
	    character nor whitespace, and collapses whitespace runs to single spaces.

	    Args:
	        text: The raw string to clean.

	    Returns:
	        The cleaned string, without leading or trailing whitespace.
	    """
	    text = text.lower()
	    # Drop punctuation and symbols; \w keeps letters, digits and underscores.
	    text = re.sub(r'[^\w\s]', '', text)
	    # Squeeze tabs, newlines and repeated spaces into one space.
	    return re.sub(r'\s+', ' ', text).strip()

	# Store a cleaned copy of every case text next to the raw column.
	df["text_ready"] = df["case_text"].apply(text_ready)

	import matplotlib.pyplot as plt

	# Word count per cleaned document, plotted as a histogram.
	text_data = df['text_ready']
	word_count = [len(text.split()) for text in text_data]

	plt.hist(word_count, bins=50, color='skyblue', edgecolor='black')
	plt.title('Distribution of Word Counts in text_ready')
	plt.xlabel('Word Count')
	plt.ylabel('Frequency')
	plt.show()

	print(df.shape) # Output: (rows, columns)

	df.describe()

	df['text']=df['text_ready']
	df['label']=df['case_outcome']
	# Take an explicit copy: the label column of `data` is overwritten further
	# down, and assigning into a bare column selection of `df` triggers pandas'
	# SettingWithCopyWarning.
	data=df[['text','label']].copy()

	# From here on `df` keeps only the 'text' and 'label' columns.
	df = df.drop(columns=["case_outcome", "case_text"])

	df.head()

	df = df.drop(columns=["text_ready"])

	df.head()

	data['label'].value_counts()

	# Map each distinct outcome string to an integer id (ids follow sorted label
	# order) and build the inverse mapping. `idx` avoids shadowing the builtin `id`.
	class_label=sorted(data['label'].unique())
	lbl2id={label:idx for idx,label in enumerate(class_label)}
	id2lb={idx:label for label,idx in lbl2id.items()}
	print(lbl2id)
	print(id2lb)



	data.head()

	# Replace the string labels in `data` with their integer ids.
	data['label']=data['label'].map(lbl2id)
	data.head()

	data.label.value_counts()

	import matplotlib.pyplot as plt

	# Bar chart of the class distribution; `df` still holds the string labels.
	df['label'].value_counts().plot.bar()
	plt.show()

	from transformers import AutoModelForSequenceClassification,AutoTokenizer
	# One checkpoint name is shared by the tokenizer and the classifier.
	model_name = 'nlpaueb/legal-bert-base-uncased'
	tokenizer = AutoTokenizer.from_pretrained(model_name)

	from transformers import AutoModelForSequenceClassification
	# Load the checkpoint for sequence classification with one output per label
	# and hand over both label mappings.
	model = AutoModelForSequenceClassification.from_pretrained(
	    model_name,
	    label2id=lbl2id,
	    id2label=id2lb,
	    num_labels=len(id2lb),
	)

	# !pip install datasets  # notebook-only shell magic: a SyntaxError in a .py file, so install the package beforehand
	from datasets import Dataset
	# preserve_index=False keeps the pandas index out of the dataset, so no
	# '__index_level_0__' column has to be removed later. Removing that column
	# unconditionally fails when the DataFrame still has a plain RangeIndex.
	ds=Dataset.from_pandas(data, preserve_index=False)
	ds

	ds['label'][:11]

	from datasets import ClassLabel
	unique_labels = sorted(set(ds['label']))
	print(f"Unique labels in Y: {unique_labels}")

	# Re-declare the integer label column as a ClassLabel feature.
	new_features = ds.features.copy()
	new_features['label'] = ClassLabel(names=unique_labels)

	ds = ds.cast(new_features)
	# 80 % train / 20 % held out; note that `data` is rebound from the DataFrame
	# to the resulting split dictionary here.
	data = ds.train_test_split(test_size=0.2, shuffle=True, seed=42)
	data

	# Halve the held-out part into the test and validation sets.
	split_ds = data['test'].train_test_split(test_size=0.5, shuffle=True, seed=42)
	split_ds

	train_data=data['train']
	test_data=split_ds['train']
	val_data=split_ds['test']

	train_data[0]

	def tokenize_fun(data):
	    """Tokenize the ``text`` entry of a batch with the module-level tokenizer.

	    Args:
	        data: A mapping with a ``text`` entry; used below as the callback of
	            ``Dataset.map(..., batched=True)``.

	    Returns:
	        The tokenizer output for the batch, padded, truncated and requested
	        as PyTorch tensors.
	    """
	    return tokenizer(data['text'],padding=True,truncation=True,return_tensors='pt')

	# Tokenize the training split batch by batch.
	tokenized_train_data=train_data.map(tokenize_fun,batched=True)

	tokenized_train_data.features

	# !pip install evaluate  # notebook-only shell magic: a SyntaxError in a .py file, so install the package beforehand
	import evaluate
	# Accuracy metric used by compute_metrics during evaluation.
	accuracy=evaluate.load('accuracy')

	import numpy as np

	def compute_metrics(eval_pred):
	    """Compute accuracy for one evaluation pass.

	    Args:
	        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds
	            per-class scores with the classes along axis 1.

	    Returns:
	        The result of the module-level ``accuracy`` metric for the
	        highest-scoring class of each row against ``labels``.
	    """
	    scores, labels = eval_pred
	    # Reduce the per-class scores to the index of the best class per row.
	    predicted_ids = np.argmax(scores, axis=1)
	    return accuracy.compute(predictions=predicted_ids, references=labels)

	# Tokenize the two held-out splits the same way as the training split.
	tokenized_test_data=test_data.map(tokenize_fun,batched=True)
	tokenized_val_data=val_data.map(tokenize_fun,batched=True)

	from huggingface_hub import login
	# Hub authentication; needed because push_to_hub=True is set below.
	login()

	from transformers import Trainer,TrainingArguments

	training_args=TrainingArguments(
	    output_dir='./quest_model',
	    # Was 2e-3, which is about two orders of magnitude above the usual
	    # BERT fine-tuning range (2e-5 to 5e-5) and typically makes training diverge.
	    learning_rate=2e-5,
	    per_device_eval_batch_size=16,
	    per_device_train_batch_size=16,
	    num_train_epochs=2,
	    weight_decay=0.01,
	    # Evaluation and checkpointing both run once per epoch.
	    eval_strategy='epoch',
	    save_strategy='epoch',
	    load_best_model_at_end=True,
	    push_to_hub=True,
	)

	# NOTE(review): newer transformers releases rename the `tokenizer` argument
	# to `processing_class` - confirm against the installed version.
	trainer=Trainer(
	    model=model,
	    tokenizer=tokenizer,
	    args=training_args,
	    train_dataset=tokenized_train_data,
	    eval_dataset=tokenized_val_data,
	    compute_metrics=compute_metrics,
	)
	trainer.train()

	model.config.id2label

	import os
	# NOTE(review): this is set after trainer.train() has already run - confirm
	# it is still needed at this point.
	os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

	# Write the model first, then the tokenizer, into the same local directory.
	for artifact in (model, tokenizer):
	    artifact.save_pretrained('./quest_model')

	tokenized_train_data[0]['text']

	from transformers import pipeline
	# Classify one sample citation sentence with the model published on the Hub.
	sample_citation = 'Hexal Australia Pty Ltd v Roche Therapeutics Inc (2005) 66 IPR 325, the likelihood of irreparable harm was regarded by Stone J as, indeed, a separate element that had to be established by an applicant for an interlocutory injunction.'
	pipe = pipeline('text-classification', model='Nainglinthu/quest_model')
	output = pipe(sample_citation)
	output

	# !pip install --upgrade gradio  # notebook-only shell magic: a SyntaxError in a .py file, so install the package beforehand
	import gradio as gr
	from transformers import pipeline

	# Initialize the text-classification pipeline served by the app.
	pipe = pipeline('text-classification', model='Nainglinthu/quest_model')

	# Function to classify text
	def classify_text(input_text):
	    """Classify one piece of legal text with the module-level pipeline.

	    Args:
	        input_text: The text entered in the Gradio text box.

	    Returns:
	        The pipeline output for ``input_text``, shown by the interface as JSON.
	    """
	    # Truncate to the model's maximum length so that long case texts are
	    # classified instead of raising inside the model.
	    return pipe(input_text, truncation=True)

	# Settings for the Gradio app: one text box in, the classifier result out as JSON.
	interface_options = {
	    "fn": classify_text,
	    "inputs": "text",
	    "outputs": "json",
	    "title": "Legal Text Classifier",
	    "description": "Classify legal text using the Nainglinthu/quest_model!",
	}
	interface = gr.Interface(**interface_options)

	# Start serving the app.
	interface.launch()