UpendraAI committed on
Commit
412c845
·
verified ·
1 Parent(s): 593e05b

Upload 3 files

Browse files
Files changed (3) hide show
  1. SushasanSampleData.csv +3 -0
  2. app.py +66 -0
  3. requirements.txt +8 -0
SushasanSampleData.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ applicationId,applicationDetail,applicationCategoryName,applicationSubCategoryName,ulbName,wardName
2
+ 1,पानी की पाइपलाइन टूटी हुई है,शिकायत,जल आपूर्ति,नगर पालिका,वार्ड 1
3
+ 2,नई स्ट्रीट लाइट लगाने की मांग,मांग,बिजली व्यवस्था,नगर पालिका,वार्ड 2
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import torch
5
+ from transformers import AutoTokenizer, AutoModel
6
+ from sklearn.linear_model import LogisticRegression
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.preprocessing import LabelEncoder
9
+ from imblearn.over_sampling import RandomOverSampler
10
+
11
@st.cache_resource
def load_model_and_tokenizer():
    """Load the pretrained ``ai4bharat/indic-bert`` tokenizer and model.

    The function is wrapped in ``st.cache_resource``, so Streamlit is
    expected to reuse the returned objects instead of reloading them.

    Returns:
        A ``(tokenizer, model)`` tuple, both created from the same checkpoint.
    """
    checkpoint = "ai4bharat/indic-bert"
    # Tuple elements are evaluated left to right: tokenizer first, then model.
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )
16
+
17
def get_embeddings(texts, tokenizer, model, batch_size=32):
    """Return one first-token (CLS position) vector per input text.

    The texts are tokenized and passed through ``model`` in chunks of at most
    ``batch_size`` items, so embedding a whole dataset does not require a
    single forward pass over every row at once. Inputs that fit in one batch
    are handled with exactly one tokenizer/model call.

    Args:
        texts: A list of strings, or a single string (treated as one text).
        tokenizer: Callable accepting ``(texts, return_tensors="pt",
            padding=True, truncation=True)`` and returning a mapping of
            keyword arguments for ``model``.
        model: Callable whose result exposes ``last_hidden_state``.
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        A tensor whose rows are ``last_hidden_state[:, 0, :]`` for each text,
        in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if isinstance(texts, str):
        # A bare string is a single document, not a sequence of characters.
        texts = [texts]

    if len(texts) <= batch_size:
        # Small (or empty) inputs take the single-call path unchanged.
        chunks = [texts]
    else:
        chunks = [
            texts[start:start + batch_size]
            for start in range(0, len(texts), batch_size)
        ]

    cls_vectors = []
    with torch.no_grad():  # inference only; no gradients are needed
        for chunk in chunks:
            inputs = tokenizer(
                chunk, return_tensors="pt", padding=True, truncation=True
            )
            outputs = model(**inputs)
            # Position 0 of the sequence axis is the CLS token.
            cls_vectors.append(outputs.last_hidden_state[:, 0, :])

    if len(cls_vectors) == 1:
        return cls_vectors[0]
    return torch.cat(cls_vectors, dim=0)
23
+
24
@st.cache_data
def load_data():
    """Read ``SushasanSampleData.csv`` and fill missing text/category cells.

    Missing ``applicationDetail`` values become an empty string and missing
    ``applicationCategoryName`` values become "अन्य".

    Returns:
        The loaded ``pandas.DataFrame`` with those two columns filled.
    """
    frame = pd.read_csv("SushasanSampleData.csv", encoding="utf-8")
    # Column name -> replacement used where the CSV cell is missing.
    defaults = {
        'applicationDetail': "",
        'applicationCategoryName': "अन्य",
    }
    for column, replacement in defaults.items():
        frame[column] = frame[column].fillna(replacement)
    return frame
30
+
31
@st.cache_resource
def preprocess_and_train(df):
    """Embed the application texts and fit a category classifier on them.

    Steps: embed ``df['applicationDetail']`` with the IndicBERT model,
    label-encode ``df['applicationCategoryName']``, balance the classes with
    random oversampling, and fit a logistic-regression classifier.

    The classifier is fitted on *all* resampled rows. An earlier version
    carved off a 20% hold-out set that was never evaluated; on small inputs
    (such as the two-row sample CSV) that split left only one class in the
    training portion and made ``LogisticRegression.fit`` fail.

    Args:
        df: DataFrame with ``applicationDetail`` (text) and
            ``applicationCategoryName`` (label) columns.

    Returns:
        A ``(clf, tokenizer, model, label_encoder)`` tuple: the fitted
        classifier, the tokenizer/model used for embedding, and the encoder
        that maps predicted ids back to category names.
    """
    tokenizer, model = load_model_and_tokenizer()
    text_embeddings = get_embeddings(df['applicationDetail'].tolist(), tokenizer, model)
    text_embeddings = text_embeddings.cpu().numpy()

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(df['applicationCategoryName'])

    # Duplicate minority-class rows so every category is equally represented.
    ros = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = ros.fit_resample(text_embeddings, labels)

    # No evaluation happens here, so every resampled row is used for fitting;
    # this guarantees the classifier sees each category present in the data.
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_resampled, y_resampled)

    return clf, tokenizer, model, label_encoder
49
+
50
# Load the data and fit the classifier before the UI is drawn.
df = load_data()
clf, tokenizer, model, label_encoder = preprocess_and_train(df)

# Streamlit UI: a title, a free-text box, and a predict button.
st.title("🇮🇳 Hindi Category Classifier (IndicBERT Powered)")

user_input = st.text_area("✍️ Enter Application Detail", "")

if st.button("🔍 Predict"):
    if user_input.strip():
        # Embed the raw (unstripped) input, then map the predicted id back
        # to its category name.
        query_vector = get_embeddings([user_input], tokenizer, model).cpu().numpy()
        predicted_ids = clf.predict(query_vector)
        label = label_encoder.inverse_transform(predicted_ids)[0]
        st.success(f"🧠 Predicted Category: **{label}**")
    else:
        # Whitespace-only input: ask for text instead of predicting.
        st.warning("Please write something.")
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+
2
+ streamlit
3
+ pandas
4
+ scikit-learn==1.3.2
5
+ imbalanced-learn==0.11.0
6
+ transformers
7
+ torch
8
+ sentencepiece