Update main with dev-sudipta
Browse files- .gitignore +134 -0
- app.py +149 -40
- data/amazon_reviews.json +9 -0
- data/imdb.json +9 -0
- data/tweet_eval.json +9 -0
- data/z_animal.csv +11 -0
- data/z_employee.csv +26 -0
- data/z_house.csv +7 -0
- methodologies.json +14 -0
- requirements.txt +6 -0
- scripts/genbit_metrics.py +48 -0
- scripts/gender_profession_tagging.py +129 -0
- scripts/gender_tagging.py +93 -0
- utils/config.json +160 -0
- utils/load_csv.py +23 -0
- utils/read_config.py +13 -0
.gitignore
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
testing/
|
4 |
+
flagged/
|
5 |
+
check_gender_tagging.py
|
6 |
+
*.py[cod]
|
7 |
+
*$py.class
|
8 |
+
|
9 |
+
# C extensions
|
10 |
+
*.so
|
11 |
+
|
12 |
+
# Distribution / packaging
|
13 |
+
.Python
|
14 |
+
build/
|
15 |
+
develop-eggs/
|
16 |
+
dist/
|
17 |
+
downloads/
|
18 |
+
eggs/
|
19 |
+
.eggs/
|
20 |
+
lib/
|
21 |
+
lib64/
|
22 |
+
parts/
|
23 |
+
sdist/
|
24 |
+
var/
|
25 |
+
wheels/
|
26 |
+
pip-wheel-metadata/
|
27 |
+
share/python-wheels/
|
28 |
+
*.egg-info/
|
29 |
+
.installed.cfg
|
30 |
+
*.egg
|
31 |
+
MANIFEST
|
32 |
+
|
33 |
+
# PyInstaller
|
34 |
+
# Usually these files are written by a python script from a template
|
35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
36 |
+
*.manifest
|
37 |
+
*.spec
|
38 |
+
|
39 |
+
# Installer logs
|
40 |
+
pip-log.txt
|
41 |
+
pip-delete-this-directory.txt
|
42 |
+
|
43 |
+
# Unit test / coverage reports
|
44 |
+
htmlcov/
|
45 |
+
.tox/
|
46 |
+
.nox/
|
47 |
+
.coverage
|
48 |
+
.coverage.*
|
49 |
+
.cache
|
50 |
+
nosetests.xml
|
51 |
+
coverage.xml
|
52 |
+
*.cover
|
53 |
+
*.py,cover
|
54 |
+
.hypothesis/
|
55 |
+
.pytest_cache/
|
56 |
+
|
57 |
+
# Translations
|
58 |
+
*.mo
|
59 |
+
*.pot
|
60 |
+
|
61 |
+
# Django stuff:
|
62 |
+
*.log
|
63 |
+
local_settings.py
|
64 |
+
db.sqlite3
|
65 |
+
db.sqlite3-journal
|
66 |
+
|
67 |
+
# Flask stuff:
|
68 |
+
instance/
|
69 |
+
.webassets-cache
|
70 |
+
|
71 |
+
# Scrapy stuff:
|
72 |
+
.scrapy
|
73 |
+
|
74 |
+
# Sphinx documentation
|
75 |
+
docs/_build/
|
76 |
+
|
77 |
+
# PyBuilder
|
78 |
+
target/
|
79 |
+
|
80 |
+
# Jupyter Notebook
|
81 |
+
.ipynb_checkpoints
|
82 |
+
|
83 |
+
# IPython
|
84 |
+
profile_default/
|
85 |
+
ipython_config.py
|
86 |
+
|
87 |
+
# pyenv
|
88 |
+
.python-version
|
89 |
+
|
90 |
+
# pipenv
|
91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
94 |
+
# install all needed dependencies.
|
95 |
+
#Pipfile.lock
|
96 |
+
|
97 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
98 |
+
__pypackages__/
|
99 |
+
|
100 |
+
# Celery stuff
|
101 |
+
celerybeat-schedule
|
102 |
+
celerybeat.pid
|
103 |
+
|
104 |
+
# SageMath parsed files
|
105 |
+
*.sage.py
|
106 |
+
|
107 |
+
# Environments
|
108 |
+
.env
|
109 |
+
.venv
|
110 |
+
env/
|
111 |
+
venv/
|
112 |
+
ENV/
|
113 |
+
env.bak/
|
114 |
+
venv.bak/
|
115 |
+
|
116 |
+
# Spyder project settings
|
117 |
+
.spyderproject
|
118 |
+
.spyproject
|
119 |
+
|
120 |
+
# Rope project settings
|
121 |
+
.ropeproject
|
122 |
+
|
123 |
+
# mkdocs documentation
|
124 |
+
/site
|
125 |
+
|
126 |
+
# mypy
|
127 |
+
.mypy_cache/
|
128 |
+
.dmypy.json
|
129 |
+
dmypy.json
|
130 |
+
|
131 |
+
# Pyre type checker
|
132 |
+
.pyre/
|
133 |
+
|
134 |
+
.idea
|
app.py
CHANGED
@@ -1,57 +1,166 @@
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
5 |
|
6 |
-
|
7 |
-
run_a(dataset_id)
|
8 |
-
elif methodology == 'B':
|
9 |
-
run_b(dataset_id)
|
10 |
-
elif methodology == 'C':
|
11 |
-
run_c(dataset_id)
|
12 |
-
|
13 |
|
14 |
-
demo = gr.Blocks(theme=gr.themes.Soft())
|
15 |
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
with gr.Row():
|
20 |
-
with gr.Column(scale=
|
21 |
-
gr.Markdown("
|
22 |
-
|
23 |
-
|
24 |
-
gr.Examples(
|
25 |
-
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
|
|
28 |
)
|
29 |
|
30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
-
|
|
|
|
|
33 |
|
34 |
with gr.Column(scale=4):
|
35 |
-
gr.Markdown("
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
methodology.change(
|
46 |
-
fn=
|
47 |
inputs=[methodology],
|
48 |
-
outputs=[
|
49 |
)
|
50 |
|
51 |
-
|
52 |
-
fn=
|
53 |
-
inputs=[
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
)
|
56 |
|
57 |
-
|
|
|
1 |
+
import json
|
2 |
import gradio as gr
|
3 |
+
import pandas as pd
|
4 |
+
import os
|
5 |
|
6 |
+
from scripts.genbit_metrics import *
|
7 |
+
from scripts.gender_profession_tagging import *
|
8 |
+
from scripts.gender_tagging import *
|
9 |
+
from utils.load_csv import *
|
10 |
+
from utils.read_config import get_args
|
11 |
|
12 |
+
methodologies = json.load(open("methodologies.json", "r"))
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
|
|
14 |
|
15 |
+
def get_methodology_metadata(methodology):
    """Build the markdown blurb (heading + description) for the chosen methodology.

    Looks the description up in the module-level ``methodologies`` mapping
    (loaded from methodologies.json) and returns a gr.Markdown update that
    also makes the component visible.
    """
    description = methodologies.get(methodology).get("description")
    return gr.Markdown.update(f"## {methodology}\n\n{description}", visible=True)
|
22 |
+
|
23 |
+
|
24 |
+
def evaluate(dataset_file, dataset_scope, dataset_scope_n, dataset_column, methodology):
    """Run the selected bias-detection methodology over the uploaded CSV.

    The concrete analysis function is dispatched by name: methodologies.json
    stores the function's name under "fx", and it is resolved via globals()
    (the scripts/* modules are star-imported at module level).

    Returns a gr.JSON update holding the resulting statistics dict.
    """
    frame = pd.read_csv(dataset_file.name)
    fx_name = methodologies.get(methodology).get("fx")

    # Positional order expected by every load_dataset_and_* entry point:
    # (dataframe, sample_method, col_name, num_sample_records).
    status = globals()[fx_name](frame, dataset_scope, dataset_column, dataset_scope_n)

    return gr.JSON.update(status, visible=True)
|
36 |
+
|
37 |
+
|
38 |
+
def process_dataset(dataset):
    """Inspect an uploaded CSV and reveal the sampling controls.

    Returns three component updates: the scope radio (First/Last/Random),
    the entry-count slider (capped by the "first_records" config value),
    and the column radio populated with the object-dtype (text) columns.
    """
    frame = pd.read_csv(dataset.name)
    text_columns = frame.select_dtypes(include=["object"]).columns.tolist()

    # Upper bound on how many rows may be analyzed in one run.
    cap = min(frame.shape[0], get_args("first_records"))

    scope_update = gr.Radio.update(
        label="Scope",
        info="Determines the scope of the dataset to be analyzed",
        choices=["First", "Last", "Random"],
        value="First",
        visible=True,
        interactive=True,
    )
    count_update = gr.Slider.update(
        label=f"Number of Entries",
        info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
        minimum=1,
        maximum=cap,
        value=cap // 2,
        visible=True,
        interactive=True,
    )
    column_update = gr.Radio.update(
        label="Column",
        info="Determines the column to be analyzed. These are the columns with text data.",
        choices=text_columns,
        value=text_columns[0],
        visible=True,
        interactive=True,
    )

    return (scope_update, count_update, column_update)
|
70 |
+
|
71 |
+
|
72 |
+
def get_column_metadata(dataset, column):
    """Preview the first 10 values of *column* as a one-column dataframe.

    Used to show the user a sample of the text corpus they selected.
    """
    preview = pd.read_csv(dataset.name)[column].head(10).tolist()
    return gr.Dataframe.update(
        value=pd.DataFrame({f"Data Corpus: {column}": preview}), visible=True
    )
|
79 |
+
|
80 |
+
|
81 |
+
# Top-level Gradio UI: wires the dataset upload, methodology picker, and
# result panes together. Importing/running this module launches the app.
BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")

with BiasAware:
    gr.Markdown(
        "# BiasAware: Dataset Bias Detection\n\nBiasAware is a specialized tool for detecting and quantifying biases within datasets used for Natural Language Processing (NLP) tasks. NLP training datasets frequently mirror the inherent biases of their source materials, resulting in AI models that unintentionally perpetuate stereotypes, exhibit underrepresentation, and showcase skewed perspectives."
    )

    with gr.Row():
        # Left column: dataset upload plus sampling controls.
        with gr.Column(scale=2):
            gr.Markdown("## Dataset")

            dataset_file = gr.File(label="Dataset")
            dataset_examples = gr.Examples(
                [
                    os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
                    os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
                    os.path.join(os.path.dirname(__file__), "data/z_house.csv"),
                ],
                inputs=dataset_file,
                label="Example Datasets",
            )

            # Hidden until a dataset is uploaded; populated by
            # process_dataset via the dataset_file.change handler below.
            dataset_scope = gr.Radio(visible=False)
            dataset_scope_n = gr.Slider(visible=False)
            dataset_column = gr.Radio(visible=False)

            dataset_corpus = gr.Dataframe(
                row_count=(5, "fixed"), col_count=(1, "fixed"), visible=False
            )

        # Middle column: methodology selection and the run button.
        with gr.Column(scale=2):
            gr.Markdown("## Methodology")

            methodology = gr.Radio(
                label="Methodology",
                info="Determines the methodology to be used for bias detection",
                choices=[
                    "Gender Divide (Term Identity Diversity)",
                    "Gender Profession Bias (Lexical Evaluation)",
                    "GenBiT (Microsoft Responsible AI Gender Bias Tool)",
                ],
            )

            evalButton = gr.Button("Run Evaluation")

            methodology_metadata = gr.Markdown(visible=False)

        # Right column: evaluation output.
        with gr.Column(scale=4):
            gr.Markdown("## Result")

            result_status = gr.JSON(visible=False)
            # NOTE(review): `result` is declared but never written by any
            # handler below — presumably reserved for future per-row output.
            result = gr.DataFrame(
                row_count=(5, "fixed"), col_count=(3, "fixed"), visible=False
            )

    # Reveal scope/size/column controls once a CSV is chosen.
    dataset_file.change(
        fn=process_dataset,
        inputs=[dataset_file],
        outputs=[dataset_scope, dataset_scope_n, dataset_column],
    )

    # Show a 10-row preview of the selected text column.
    dataset_column.change(
        fn=get_column_metadata,
        inputs=[dataset_file, dataset_column],
        outputs=[dataset_corpus],
    )

    # Display the methodology description loaded from methodologies.json.
    methodology.change(
        fn=get_methodology_metadata,
        inputs=[methodology],
        outputs=[methodology_metadata],
    )

    # Run the chosen methodology and show its statistics as JSON.
    evalButton.click(
        fn=evaluate,
        inputs=[
            dataset_file,
            dataset_scope,
            dataset_scope_n,
            dataset_column,
            methodology,
        ],
        outputs=[result_status],
    )

BiasAware.launch()
|
data/amazon_reviews.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gender" : 14500,
|
3 |
+
"no gender" : 195500,
|
4 |
+
"equal gender" : 253,
|
5 |
+
"female pg" : 125,
|
6 |
+
"male pg" : 117,
|
7 |
+
"female spg" : 7196,
|
8 |
+
"male spg" : 6809
|
9 |
+
}
|
data/imdb.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gender" : 36174,
|
3 |
+
"no gender" : 13826,
|
4 |
+
"equal gender" : 2160,
|
5 |
+
"female pg" : 2776,
|
6 |
+
"male pg" : 3440,
|
7 |
+
"female spg" : 6918,
|
8 |
+
"male spg" : 20880
|
9 |
+
}
|
data/tweet_eval.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"gender" : 10247,
|
3 |
+
"no gender" : 49652,
|
4 |
+
"equal gender" : 141,
|
5 |
+
"female pg" : 37,
|
6 |
+
"male pg" : 42,
|
7 |
+
"female spg" : 2478,
|
8 |
+
"male spg" : 7549
|
9 |
+
}
|
data/z_animal.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
|
2 |
+
1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
|
3 |
+
2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
|
4 |
+
3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
|
5 |
+
4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
|
6 |
+
5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
|
7 |
+
6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
|
8 |
+
7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
|
9 |
+
8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
|
10 |
+
9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
|
11 |
+
10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
|
data/z_employee.csv
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
EmployeeID,FirstName,LastName,Email,Department,Salary
|
2 |
+
101,John,Smith,[email protected],Finance,60000
|
3 |
+
102,Emily,Johnson,[email protected],Marketing,55000
|
4 |
+
103,Michael,Williams,[email protected],HR,50000
|
5 |
+
104,Susan,Anderson,[email protected],IT,65000
|
6 |
+
105,David,Martin,[email protected],Sales,58000
|
7 |
+
106,Linda,Davis,[email protected],Finance,62000
|
8 |
+
107,William,Miller,[email protected],Marketing,56000
|
9 |
+
108,Sarah,Anderson,[email protected],HR,51000
|
10 |
+
109,Robert,Clark,[email protected],IT,67000
|
11 |
+
110,Karen,Wilson,[email protected],Sales,59000
|
12 |
+
111,James,Brown,[email protected],Finance,61000
|
13 |
+
112,Anna,Johnson,[email protected],Marketing,57000
|
14 |
+
113,Christopher,Moore,[email protected],HR,52000
|
15 |
+
114,Laura,White,[email protected],IT,68000
|
16 |
+
115,Mark,Davis,[email protected],Sales,60000
|
17 |
+
116,Patricia,Jones,[email protected],Finance,63000
|
18 |
+
117,Matthew,Taylor,[email protected],Marketing,58000
|
19 |
+
118,Jennifer,Young,[email protected],HR,53000
|
20 |
+
119,Steven,Anderson,[email protected],IT,69000
|
21 |
+
120,Elizabeth,Thomas,[email protected],Sales,61000
|
22 |
+
121,Kevin,Harris,[email protected],Finance,64000
|
23 |
+
122,Deborah,Smith,[email protected],Marketing,59000
|
24 |
+
123,Joseph,Walker,[email protected],HR,54000
|
25 |
+
124,Cynthia,Jackson,[email protected],IT,70000
|
26 |
+
125,Daniel,Hall,[email protected],Sales,62000
|
data/z_house.csv
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
|
2 |
+
1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
|
3 |
+
2,456 Elm St,New York,NY,10001,2,1,1200,750000
|
4 |
+
3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
|
5 |
+
4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
|
6 |
+
5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
|
7 |
+
6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
|
methodologies.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"Gender Divide (Term Identity Diversity)": {
|
3 |
+
"description": "333",
|
4 |
+
"fx": "load_dataset_and_analyze_gender_tag"
|
5 |
+
},
|
6 |
+
"Gender Profession Bias (Lexical Evaluation)": {
|
7 |
+
"description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
|
8 |
+
"fx": "load_dataset_and_analyze_gender_profession"
|
9 |
+
},
|
10 |
+
"GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
|
11 |
+
"description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
|
12 |
+
"fx": "load_dataset_and_get_genbit_metrics"
|
13 |
+
}
|
14 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.40.1
|
2 |
+
gradio_client==0.5.0
|
3 |
+
numpy==1.25.2
|
4 |
+
pandas==2.0.3
|
5 |
+
spacy
|
6 |
+
genbit
|
scripts/genbit_metrics.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from genbit.genbit_metrics import GenBitMetrics
|
2 |
+
import pandas as pd
|
3 |
+
from utils.read_config import get_args
|
4 |
+
from utils.load_csv import load_sample
|
5 |
+
|
6 |
+
|
7 |
+
def cal_metrics(dataset):
    """Score *dataset* (a list of raw text strings) with Microsoft GenBiT.

    Returns the full metrics dict. ``output_statistics`` / ``output_word_list``
    are enabled because downstream code reads
    ``stats["statistics"]["num_words_considered"]``.
    """
    genbit = GenBitMetrics(
        language_code="en",
        context_window=5,
        distance_weight=0.95,
        percentile_cutoff=80,
    )

    # Feed raw (untokenized) sentences to GenBiT.
    genbit.add_data(dataset, tokenized=False)

    return genbit.get_metrics(output_statistics=True, output_word_list=True)
|
22 |
+
|
23 |
+
|
24 |
+
# Function to extract genbit metrics
|
25 |
+
def extract_genbit_metris(stats):
    """Pick the headline GenBiT numbers out of *stats* and stringify them.

    (The name keeps the original "metris" spelling for caller compatibility.)
    Every value is converted to str so the result is JSON-display friendly.
    """
    top_level_keys = (
        "genbit_score",
        "percentage_of_female_gender_definition_words",
        "percentage_of_male_gender_definition_words",
        "percentage_of_non_binary_gender_definition_words",
        "percentage_of_trans_gender_definition_words",
        "percentage_of_cis_gender_definition_words",
    )
    metrics = {key: str(stats[key]) for key in top_level_keys}
    metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
    return metrics
|
36 |
+
|
37 |
+
def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
    """Sample *df* and return GenBiT metrics for the text in *col_name*.

    sample_method selects First/Last/Random rows (see utils.load_csv.load_sample);
    num_sample_records bounds the sample size.
    """
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # GenBiT consumes a plain list of strings.
    corpus = sample_df[col_name].tolist()

    return extract_genbit_metris(cal_metrics(corpus))
|
scripts/gender_profession_tagging.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import re
|
3 |
+
import spacy
|
4 |
+
from spacy.lang.en import English
|
5 |
+
import time
|
6 |
+
from tqdm import tqdm
|
7 |
+
import multiprocessing.pool
|
8 |
+
|
9 |
+
import warnings
|
10 |
+
warnings.filterwarnings("ignore")
|
11 |
+
from utils.read_config import get_args
|
12 |
+
from utils.load_csv import load_sample
|
13 |
+
|
14 |
+
|
15 |
+
# For sentence split
|
16 |
+
nlp = English()
|
17 |
+
nlp.add_pipe("sentencizer")
|
18 |
+
|
19 |
+
# Function to split sentences
|
20 |
+
def get_split_text(text):
    """Split *text* into sentences using the module-level spaCy sentencizer.

    Returns a list of spaCy Span objects (callers str() them as needed).
    """
    return [sentence for sentence in nlp(text).sents]
|
25 |
+
|
26 |
+
def get_gender_prof_match_details(df_text):
    """Scan each sentence of *df_text* for gendered pronouns and professions.

    Returns one tuple per sentence:
    (sentence, male pronoun matches, female pronoun matches,
     profession matches, "Yes"/"No" for pronoun+profession co-occurrence).
    Match lists are comma-joined strings ("" when nothing matched).
    """
    # Term lists come from utils/config.json.
    male_pat, female_pat, prof_pat = get_regex_pattern(
        get_args("male_pronoun"), get_args("female_pronoun"), get_args("professions")
    )

    rows = []
    for sentence in get_split_text(df_text):
        sent = str(sentence)

        males = re.findall(male_pat, sent)
        females = re.findall(female_pat, sent)
        profs = re.findall(prof_pat, sent)

        # "Yes" when a profession co-occurs with either gender's pronoun.
        has_both = "Yes" if profs and (males or females) else "No"

        rows.append(
            (sent, ",".join(males), ",".join(females), ",".join(profs), has_both)
        )

    return rows
|
64 |
+
|
65 |
+
# Function to call multiprocessing threadpool
|
66 |
+
def call_multiprocessing_pool(df_text):
    """Tag every text in *df_text* concurrently and collect a flat dataframe.

    Each element of *df_text* yields a list of per-sentence tuples from
    get_gender_prof_match_details; the nested results are flattened into
    one row per sentence.
    """
    # NOTE(review): 2000 threads is unusually high for a ThreadPool —
    # presumably tolerable because the work is GIL-releasing; confirm
    # before tuning.
    pool = multiprocessing.pool.ThreadPool(processes=2000)
    nested = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
    pool.close()

    flat_rows = [row for per_text in nested for row in per_text]

    columns = ["Split_Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
    return pd.DataFrame(flat_rows, columns=columns)
|
80 |
+
|
81 |
+
# Function to get statistics
|
82 |
+
def get_statistics(results_df):
    """Summarize pronoun/profession co-occurrence counts as a JSON-able dict.

    *results_df* is the dataframe built by call_multiprocessing_pool; empty
    string means "no match" for a given column. All counts are stringified.
    """
    has_male = results_df["Male Pronoun"] != ""
    has_female = results_df["Female Pronoun"] != ""
    has_prof = results_df["Profession"] != ""

    return {
        "total_sentence": str(results_df.shape[0]),
        "both_gender_prof_match": str(
            results_df[results_df["Both Match"] == "Yes"]["Both Match"].count()
        ),
        "count_male_pronoun": str(results_df[has_male]["Male Pronoun"].count()),
        "count_female_pronoun": str(results_df[has_female]["Female Pronoun"].count()),
        "count_male_pronoun_profession": str(
            results_df[has_male & has_prof]["Male Pronoun"].count()
        ),
        "count_female_pronoun_profession": str(
            results_df[has_female & has_prof]["Female Pronoun"].count()
        ),
    }
|
99 |
+
|
100 |
+
# Function to return regular expression patterns
|
101 |
+
def get_regex_pattern(male_pronoun, female_pronoun, professions):
    """Build word-boundary alternation patterns for each term list.

    Professions are lowercased so they match the lowercased input text;
    pronoun lists are assumed to already be lowercase (see config.json).
    """
    def alternation(terms):
        return r"\b({})\b".format("|".join(terms))

    lowered_professions = [profession.lower() for profession in professions]

    return (
        alternation(male_pronoun),
        alternation(female_pronoun),
        alternation(lowered_professions),
    )
|
112 |
+
|
113 |
+
|
114 |
+
def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
    """Sample *df*, tag gender/profession co-occurrence, and return statistics.

    Entry point registered in methodologies.json ("fx").
    """
    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Normalize the text once so the lowercase regex patterns match.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    tagged = call_multiprocessing_pool(sample_df[col_name])

    return get_statistics(tagged)
|
scripts/gender_tagging.py
ADDED
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import required libraries
|
2 |
+
import pandas as pd
|
3 |
+
import re
|
4 |
+
from utils.read_config import get_args
|
5 |
+
from utils.load_csv import load_sample
|
6 |
+
|
7 |
+
# Function to get count of male terms in text
|
8 |
+
def count_male_terms(text, male_terms):
    """Count word-boundary occurrences of any term in *male_terms* within *text*."""
    pattern = r"\b({})\b".format("|".join(male_terms))
    return len(re.findall(pattern, str(text)))
|
13 |
+
|
14 |
+
# Function to get count of female terms in text
|
15 |
+
def count_female_terms(text, female_terms):
    """Count word-boundary occurrences of any term in *female_terms* within *text*."""
    pattern = r"\b({})\b".format("|".join(female_terms))
    return len(re.findall(pattern, str(text)))
|
20 |
+
|
21 |
+
# Function to get gender tag categories
|
22 |
+
def get_gender_tag(count_m_term, count_f_term):
    """Classify a text by the balance of male vs. female term counts.

    Categories: "No Gender" (no terms at all), "Equal Gender" (tie),
    "<Sex> Positive Gender" (majority share below 75%), and
    "<Sex> Strongly Positive Gender" (majority share 75% or more).
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    total = count_m_term + count_f_term
    if count_m_term > count_f_term:
        share = count_m_term / total * 100
        return (
            "Male Strongly Positive Gender" if share >= 75 else "Male Positive Gender"
        )

    share = count_f_term / total * 100
    return (
        "Female Strongly Positive Gender" if share >= 75 else "Female Positive Gender"
    )
|
45 |
+
|
46 |
+
|
47 |
+
# Function to calculate PG and SPG
|
48 |
+
def get_pg_spg(sample_df):
    """Count sentences per gender tag category and return them as strings.

    *sample_df* must carry a "gender_cat" column produced by get_gender_tag.
    Keys mirror the summary JSON shipped in data/*.json (pg = positive
    gender, spg = strongly positive gender).

    Bug fix: the female strongly-positive bucket previously matched the
    misspelled label "Female Stronly Positive Gender" and therefore was
    always reported as 0.
    """
    tags = sample_df["gender_cat"]

    def count(label):
        # Number of rows whose tag equals *label*.
        return tags[tags == label].count()

    return {
        "gender": str(tags[tags != "No Gender"].count()),
        "no gender": str(count("No Gender")),
        "equal gender": str(count("Equal Gender")),
        "female pg": str(count("Female Positive Gender")),
        "male pg": str(count("Male Positive Gender")),
        "female spg": str(count("Female Strongly Positive Gender")),
        "male spg": str(count("Male Strongly Positive Gender")),
    }
|
69 |
+
|
70 |
+
# Function to load dataset and get the analysis done
|
71 |
+
def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
    """Sample *df* and bucket rows of *col_name* by gender-term balance.

    Entry point registered in methodologies.json ("fx"). Returns the
    category-count dict from get_pg_spg.

    Bug fix: the female-term count previously received the whole row
    (``x[:]``) instead of the text column, so female terms were counted
    over every column's stringified values — inflating female counts on
    multi-column datasets. Both counts now read ``row[col_name]``.
    """
    # Term lists come from utils/config.json.
    male_terms = get_args("male_terms")
    female_terms = get_args("female_terms")

    sample_df = load_sample(num_sample_records, sample_method, df, col_name)

    # Normalize the text once so the lowercase term lists match.
    sample_df[col_name] = sample_df[col_name].str.lower().str.strip()

    # Per-row male/female term counts over the selected text column only.
    sample_df["count_male_term"] = sample_df.apply(
        lambda row: count_male_terms(row[col_name], male_terms), axis=1
    )
    sample_df["count_female_term"] = sample_df.apply(
        lambda row: count_female_terms(row[col_name], female_terms), axis=1
    )

    # Derive the categorical tag from the two counts.
    sample_df["gender_cat"] = sample_df.apply(
        lambda row: get_gender_tag(row["count_male_term"], row["count_female_term"]),
        axis=1,
    )

    return get_pg_spg(sample_df)
|
91 |
+
|
92 |
+
|
93 |
+
|
utils/config.json
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"first_records" : 2000,
|
3 |
+
"random_seed" : 42,
|
4 |
+
"male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
|
5 |
+
"female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
|
6 |
+
"male_pronoun" : ["he", "him", "his"],
|
7 |
+
"female_pronoun" : ["she", "her", "hers"],
|
8 |
+
"professions" : ["Accountant",
|
9 |
+
"Actor",
|
10 |
+
"Actress",
|
11 |
+
"Aerospace Engineer",
|
12 |
+
"Agricultural Scientist",
|
13 |
+
"Air Traffic Controller",
|
14 |
+
"Aircraft Mechanic",
|
15 |
+
"Animator",
|
16 |
+
"Architect",
|
17 |
+
"Art Director",
|
18 |
+
"Attorney",
|
19 |
+
"Lawyer",
|
20 |
+
"Audiologist",
|
21 |
+
"Author",
|
22 |
+
"Writer",
|
23 |
+
"Baker",
|
24 |
+
"Barber",
|
25 |
+
"Hairdresser",
|
26 |
+
"Bartender",
|
27 |
+
"Biomedical Engineer",
|
28 |
+
"Botanist",
|
29 |
+
"Broadcast Journalist",
|
30 |
+
"Business Analyst",
|
31 |
+
"Carpenter",
|
32 |
+
"Chef",
|
33 |
+
"Cook",
|
34 |
+
"Chemist",
|
35 |
+
"Civil Engineer",
|
36 |
+
"Clinical Psychologist",
|
37 |
+
"Commercial Diver",
|
38 |
+
"Computer Programmer",
|
39 |
+
"Construction Worker",
|
40 |
+
"Corporate Trainer",
|
41 |
+
"Cosmetologist",
|
42 |
+
"Counselor",
|
43 |
+
"Therapist",
|
44 |
+
"Court Reporter",
|
45 |
+
"Creative Director",
|
46 |
+
"Criminologist",
|
47 |
+
"Customer Service Representative",
|
48 |
+
"Data Analyst",
|
49 |
+
"Dental Assistant",
|
50 |
+
"Dentist",
|
51 |
+
"Dermatologist",
|
52 |
+
"Dietician",
|
53 |
+
"Nutritionist",
|
54 |
+
"Doctor",
|
55 |
+
"Physician",
|
56 |
+
"Economist",
|
57 |
+
"Electrician",
|
58 |
+
"Elementary School Teacher",
|
59 |
+
"Emergency Medical Technician",
|
60 |
+
"Engineer",
|
61 |
+
"Environmental Scientist",
|
62 |
+
"Event Planner",
|
63 |
+
"Fashion Designer",
|
64 |
+
"Film Director",
|
65 |
+
"Financial Analyst",
|
66 |
+
"Firefighter",
|
67 |
+
"Fisherman",
|
68 |
+
"Fitness Trainer",
|
69 |
+
"Flight Attendant",
|
70 |
+
"Florist",
|
71 |
+
"Food Scientist",
|
72 |
+
"Forensic Scientist",
|
73 |
+
"Furniture Maker",
|
74 |
+
"Game Developer",
|
75 |
+
"Gardener",
|
76 |
+
"Landscaper",
|
77 |
+
"Geologist",
|
78 |
+
"Graphic Designer",
|
79 |
+
"Hair Stylist",
|
80 |
+
"Historian",
|
81 |
+
"Home Health Aide",
|
82 |
+
"Hotel Manager",
|
83 |
+
"Human Resources Manager",
|
84 |
+
"Immigration Lawyer",
|
85 |
+
"Industrial Designer",
|
86 |
+
"Insurance Agent",
|
87 |
+
"Interior Designer",
|
88 |
+
"Interpreter",
|
89 |
+
"Translator",
|
90 |
+
"Investment Banker",
|
91 |
+
"IT Specialist",
|
92 |
+
"Journalist",
|
93 |
+
"Judge",
|
94 |
+
"Kindergarten Teacher",
|
95 |
+
"Land Surveyor",
|
96 |
+
"Landscape Architect",
|
97 |
+
"Lawyer",
|
98 |
+
"Attorney",
|
99 |
+
"Librarian",
|
100 |
+
"Life Coach",
|
101 |
+
"Linguist",
|
102 |
+
"Makeup Artist",
|
103 |
+
"Management Consultant",
|
104 |
+
"Manufacturing Engineer",
|
105 |
+
"Marine Biologist",
|
106 |
+
"Marketing Manager",
|
107 |
+
"Massage Therapist",
|
108 |
+
"Mechanical Engineer",
|
109 |
+
"Medical Assistant",
|
110 |
+
"Medical Researcher",
|
111 |
+
"Meteorologist",
|
112 |
+
"Midwife",
|
113 |
+
"Military Officer",
|
114 |
+
"Music Producer",
|
115 |
+
"Musician",
|
116 |
+
"Nurse",
|
117 |
+
"Occupational Therapist",
|
118 |
+
"Optician",
|
119 |
+
"Optometrist",
|
120 |
+
"Paralegal",
|
121 |
+
"Paramedic",
|
122 |
+
"Patent Attorney",
|
123 |
+
"Pediatrician",
|
124 |
+
"Personal Trainer",
|
125 |
+
"Petroleum Engineer",
|
126 |
+
"Pharmacist",
|
127 |
+
"Photographer",
|
128 |
+
"Physical Therapist",
|
129 |
+
"Physician Assistant",
|
130 |
+
"Pilot",
|
131 |
+
"Plumber",
|
132 |
+
"Police Officer",
|
133 |
+
"Political Scientist",
|
134 |
+
"Preschool Teacher",
|
135 |
+
"Private Investigator",
|
136 |
+
"Product Manager",
|
137 |
+
"Professor",
|
138 |
+
"Lecturer",
|
139 |
+
"Programmer",
|
140 |
+
"Psychiatrist",
|
141 |
+
"Psychologist",
|
142 |
+
"Public Relations Specialist",
|
143 |
+
"Public School Teacher",
|
144 |
+
"Real Estate Agent",
|
145 |
+
"Broker",
|
146 |
+
"Receptionist",
|
147 |
+
"Registered Nurse",
|
148 |
+
"Reporter",
|
149 |
+
"Restaurant Manager",
|
150 |
+
"Sales Representative",
|
151 |
+
"School Counselor",
|
152 |
+
"Scientist",
|
153 |
+
"Screenwriter",
|
154 |
+
"Social Media Manager",
|
155 |
+
"Social Worker",
|
156 |
+
"Software Developer",
|
157 |
+
"Speech-Language Pathologist",
|
158 |
+
"Sports Coach",
|
159 |
+
"Statistician"]
|
160 |
+
}
|
utils/load_csv.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from utils.read_config import get_args
|
3 |
+
|
4 |
+
# Function to load sample of dataset
|
5 |
+
|
6 |
+
|
7 |
+
def load_sample(num_sample_records, sample_method, df, col_name):
    """Return a sample of *df* restricted to the single column *col_name*.

    Parameters
    ----------
    num_sample_records : int
        Requested number of rows; capped by the ``first_records`` config
        value and by the size of *df*.
    sample_method : str
        One of "First", "Last", or "Random". Any other value returns the
        full (single-column) frame unchanged, matching the original behavior.
    df : pandas.DataFrame
        Source dataset.
    col_name : str
        Column to keep.

    Returns
    -------
    pandas.DataFrame
        Sampled frame with a fresh integer index (the old index is kept as
        an ``index`` column, as in the original implementation).
    """
    sample_first_records = get_args("first_records")
    sample_random_seed = get_args("random_seed")

    # Cap at the configured maximum AND at the dataset size:
    # df.sample(n) raises ValueError when n exceeds len(df).
    num_sample_records = min(num_sample_records, sample_first_records, len(df))

    # Keep only the required column.
    df = df[[col_name]]
    if sample_method == "First":
        df = df.iloc[:num_sample_records].copy().reset_index()
    elif sample_method == "Last":
        df = df.iloc[-num_sample_records:].copy().reset_index()
    elif sample_method == "Random":
        df = df.sample(num_sample_records,
                       random_state=sample_random_seed).copy().reset_index()
    return df
|
utils/read_config.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
|
3 |
+
def read_config_file():
    """Load and return the JSON configuration from ``utils/config.json``.

    Returns
    -------
    dict
        Parsed configuration.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # Explicit encoding: config.json is UTF-8, and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open("utils/config.json", "r", encoding="utf-8") as jsonfile:
        data = json.load(jsonfile)
    return data
|
7 |
+
|
8 |
+
def get_args(args):
    """Return the value stored under key *args* in the config file.

    Parameters
    ----------
    args : str
        Configuration key to look up.

    Returns
    -------
    The configured value for *args*.

    Raises
    ------
    RuntimeError
        If the config file cannot be read or parsed.
    KeyError
        If the key is not present in the config.
    """
    try:
        data = read_config_file()
    except (OSError, json.JSONDecodeError) as err:
        # The original ``raise "..."`` raised a plain string, which is itself
        # a TypeError in Python 3; raise a real exception and chain the cause.
        raise RuntimeError("Could not read config file.") from err
    return data[args]
|