sudipta002 committed
Commit f12d373 · Parents: 11c7cac 0321f34

Update main with dev-sudipta

.gitignore ADDED
@@ -0,0 +1,134 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ testing/
+ flagged/
+ check_gender_tagging.py
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ .idea
app.py CHANGED
@@ -1,57 +1,166 @@
  import gradio as gr
- def run_evaluation(dataset_id, methodology):
- return f'Running evaluation for {dataset_id} with {methodology}'
- if methodology == 'A':
- run_a(dataset_id)
- elif methodology == 'B':
- run_b(dataset_id)
- elif methodology == 'C':
- run_c(dataset_id)
-
- demo = gr.Blocks(theme=gr.themes.Soft())
- with demo:
- gr.Markdown("# BiasAware: Dataset Bias Detection")
-
  with gr.Row():
- with gr.Column(scale=1):
- gr.Markdown("Select a dataset to analyze")
-
- dataset_id = gr.Text(label="Dataset")
- gr.Examples(
- examples=["imdb", "amazon_reviews_multi", "tweet_eval"],
- fn=run_evaluation,
- inputs=[dataset_id]
  )
- methodology = gr.Dropdown(["Term Identity Diversity Analysis", "Textual Gender Label Evaluation", "GenBit"], label="Methodology")
- button = gr.Button("Run Evaluation")
  with gr.Column(scale=4):
- gr.Markdown("### Results")
-
- with gr.Box():
- methodology_title = gr.Markdown("### Identity Term Sampling")
- methodology_description = gr.Markdown("lorem ipsum")
-
- methodology_test_description = gr.Markdown("lorem ipsum")
- outputs = gr.Markdown()
- gr.Error("No results to display")
-
  methodology.change(
- fn=lambda x: (f'### {x}', "lorem ipseum", "lorem ipsum"),
  inputs=[methodology],
- outputs=[methodology_title, methodology_description, methodology_test_description]
  )
- button.click(
- fn=run_evaluation,
- inputs=[dataset_id, methodology],
- outputs=[outputs]
  )
- demo.launch()

+ import json
  import gradio as gr
+ import pandas as pd
+ import os
+
+ from scripts.genbit_metrics import *
+ from scripts.gender_profession_tagging import *
+ from scripts.gender_tagging import *
+ from utils.load_csv import *
+ from utils.read_config import get_args
+
+ methodologies = json.load(open("methodologies.json", "r"))
+
+
+ def get_methodology_metadata(methodology):
+     title = "## " + methodology
+     description = methodologies.get(methodology).get("description")
+
+     metadata = f"{title}\n\n{description}"
+
+     return gr.Markdown.update(metadata, visible=True)
+
+
+ def evaluate(dataset_file, dataset_scope, dataset_scope_n, dataset_column, methodology):
+     status = {}
+     dataset = pd.read_csv(dataset_file.name)
+     sample_method = dataset_scope
+     col_name = dataset_column
+     num_sample_records = dataset_scope_n
+
+     status = globals()[methodologies.get(methodology).get("fx")](
+         dataset, sample_method, col_name, num_sample_records
+     )
+
+     return gr.JSON.update(status, visible=True)
+
+
+ def process_dataset(dataset):
+     data = pd.read_csv(dataset.name)
+
+     columns = data.select_dtypes(include=["object"]).columns.tolist()
+
+     return (
+         gr.Radio.update(
+             label="Scope",
+             info="Determines the scope of the dataset to be analyzed",
+             choices=["First", "Last", "Random"],
+             value="First",
+             visible=True,
+             interactive=True,
+         ),
+         gr.Slider.update(
+             label=f"Number of Entries",
+             info=f"Determines the number of entries to be analyzed. Due to computational constraints, the maximum number of entries that can be analyzed is {get_args('first_records')}.",
+             minimum=1,
+             maximum=min(data.shape[0], get_args("first_records")),
+             value=min(data.shape[0], get_args("first_records")) // 2,
+             visible=True,
+             interactive=True,
+         ),
+         gr.Radio.update(
+             label="Column",
+             info="Determines the column to be analyzed. These are the columns with text data.",
+             choices=columns,
+             value=columns[0],
+             visible=True,
+             interactive=True,
+         ),
+     )
+
+
+ def get_column_metadata(dataset, column):
+     data = pd.read_csv(dataset.name)
+     corpus = data[column].head(10).tolist()
+
+     return gr.Dataframe.update(
+         value=pd.DataFrame({f"Data Corpus: {column}": corpus}), visible=True
+     )
+
+
+ BiasAware = gr.Blocks(title="BiasAware: Dataset Bias Detection")
+
+ with BiasAware:
+     gr.Markdown(
+         "# BiasAware: Dataset Bias Detection\n\nBiasAware is a specialized tool for detecting and quantifying biases within datasets used for Natural Language Processing (NLP) tasks. NLP training datasets frequently mirror the inherent biases of their source materials, resulting in AI models that unintentionally perpetuate stereotypes, exhibit underrepresentation, and showcase skewed perspectives."
+     )
+
      with gr.Row():
+         with gr.Column(scale=2):
+             gr.Markdown("## Dataset")
+
+             dataset_file = gr.File(label="Dataset")
+             dataset_examples = gr.Examples(
+                 [
+                     os.path.join(os.path.dirname(__file__), "data/z_animal.csv"),
+                     os.path.join(os.path.dirname(__file__), "data/z_employee.csv"),
+                     os.path.join(os.path.dirname(__file__), "data/z_house.csv"),
+                 ],
+                 inputs=dataset_file,
+                 label="Example Datasets",
              )

+             dataset_scope = gr.Radio(visible=False)
+             dataset_scope_n = gr.Slider(visible=False)
+             dataset_column = gr.Radio(visible=False)
+
+             dataset_corpus = gr.Dataframe(
+                 row_count=(5, "fixed"), col_count=(1, "fixed"), visible=False
+             )
+
+         with gr.Column(scale=2):
+             gr.Markdown("## Methodology")
+
+             methodology = gr.Radio(
+                 label="Methodology",
+                 info="Determines the methodology to be used for bias detection",
+                 choices=[
+                     "Gender Divide (Term Identity Diversity)",
+                     "Gender Profession Bias (Lexical Evaluation)",
+                     "GenBiT (Microsoft Responsible AI Gender Bias Tool)",
+                 ],
+             )

+             evalButton = gr.Button("Run Evaluation")
+
+             methodology_metadata = gr.Markdown(visible=False)

          with gr.Column(scale=4):
+             gr.Markdown("## Result")
+
+             result_status = gr.JSON(visible=False)
+             result = gr.DataFrame(
+                 row_count=(5, "fixed"), col_count=(3, "fixed"), visible=False
+             )
+
+     dataset_file.change(
+         fn=process_dataset,
+         inputs=[dataset_file],
+         outputs=[dataset_scope, dataset_scope_n, dataset_column],
+     )
+
+     dataset_column.change(
+         fn=get_column_metadata,
+         inputs=[dataset_file, dataset_column],
+         outputs=[dataset_corpus],
+     )
+
      methodology.change(
+         fn=get_methodology_metadata,
          inputs=[methodology],
+         outputs=[methodology_metadata],
      )

+     evalButton.click(
+         fn=evaluate,
+         inputs=[
+             dataset_file,
+             dataset_scope,
+             dataset_scope_n,
+             dataset_column,
+             methodology,
+         ],
+         outputs=[result_status],
      )

+ BiasAware.launch()
data/amazon_reviews.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "gender" : 14500,
+     "no gender" : 195500,
+     "equal gender" : 253,
+     "female pg" : 125,
+     "male pg" : 117,
+     "female spg" : 7196,
+     "male spg" : 6809
+ }
data/imdb.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "gender" : 36174,
+     "no gender" : 13826,
+     "equal gender" : 2160,
+     "female pg" : 2776,
+     "male pg" : 3440,
+     "female spg" : 6918,
+     "male spg" : 20880
+ }
data/tweet_eval.json ADDED
@@ -0,0 +1,9 @@
+ {
+     "gender" : 10247,
+     "no gender" : 49652,
+     "equal gender" : 141,
+     "female pg" : 37,
+     "male pg" : 42,
+     "female spg" : 2478,
+     "male spg" : 7549
+ }
data/z_animal.csv ADDED
@@ -0,0 +1,11 @@
+ AnimalID,CommonName,ScientificName,Class,Order,Family,Habitat,ConservationStatus
+ 1,Lion,Panthera leo,Mammalia,Carnivora,Felidae,Savanna,Vulnerable
+ 2,Eagle,Aquila chrysaetos,Aves,Accipitriformes,Accipitridae,Mountains,Least Concern
+ 3,Dolphin,Tursiops truncatus,Mammalia,Cetacea,Delphinidae,Ocean,Least Concern
+ 4,Elephant,Loxodonta africana,Mammalia,Proboscidea,Elephantidae,Grassland,Vulnerable
+ 5,Tiger,Panthera tigris,Mammalia,Carnivora,Felidae,Forest,Endangered
+ 6,Penguin,Spheniscidae,Aves,Sphenisciformes,Spheniscidae,Antarctica,Least Concern
+ 7,Giraffe,Giraffa camelopardalis,Mammalia,Artiodactyla,Giraffidae,Savanna,Vulnerable
+ 8,Cheetah,Acinonyx jubatus,Mammalia,Carnivora,Felidae,Grassland,Vulnerable
+ 9,Panda,Ailuropoda melanoleuca,Mammalia,Carnivora,Ursidae,Forest,Endangered
+ 10,Kangaroo,Macropus rufus,Mammalia,Diprotodontia,Macropodidae,Grassland,Least Concern
data/z_employee.csv ADDED
@@ -0,0 +1,26 @@
+ EmployeeID,FirstName,LastName,Email,Department,Salary
+ 101,John,Smith,[email protected],Finance,60000
+ 102,Emily,Johnson,[email protected],Marketing,55000
+ 103,Michael,Williams,[email protected],HR,50000
+ 104,Susan,Anderson,[email protected],IT,65000
+ 105,David,Martin,[email protected],Sales,58000
+ 106,Linda,Davis,[email protected],Finance,62000
+ 107,William,Miller,[email protected],Marketing,56000
+ 108,Sarah,Anderson,[email protected],HR,51000
+ 109,Robert,Clark,[email protected],IT,67000
+ 110,Karen,Wilson,[email protected],Sales,59000
+ 111,James,Brown,[email protected],Finance,61000
+ 112,Anna,Johnson,[email protected],Marketing,57000
+ 113,Christopher,Moore,[email protected],HR,52000
+ 114,Laura,White,[email protected],IT,68000
+ 115,Mark,Davis,[email protected],Sales,60000
+ 116,Patricia,Jones,[email protected],Finance,63000
+ 117,Matthew,Taylor,[email protected],Marketing,58000
+ 118,Jennifer,Young,[email protected],HR,53000
+ 119,Steven,Anderson,[email protected],IT,69000
+ 120,Elizabeth,Thomas,[email protected],Sales,61000
+ 121,Kevin,Harris,[email protected],Finance,64000
+ 122,Deborah,Smith,[email protected],Marketing,59000
+ 123,Joseph,Walker,[email protected],HR,54000
+ 124,Cynthia,Jackson,[email protected],IT,70000
+ 125,Daniel,Hall,[email protected],Sales,62000
data/z_house.csv ADDED
@@ -0,0 +1,7 @@
+ PropertyID,StreetAddress,City,State,ZipCode,NumberOfBedrooms,NumberOfBathrooms,SquareFootage,Price
+ 1,123 Main St,Los Angeles,CA,90001,3,2,1800,550000
+ 2,456 Elm St,New York,NY,10001,2,1,1200,750000
+ 3,789 Oak St,San Francisco,CA,94101,4,3,2500,950000
+ 4,101 Maple St,Boston,MA,02101,3,2.5,2000,680000
+ 5,202 Pine St,Miami,FL,33101,4,3.5,2700,820000
+ 6,303 Cedar St,Chicago,IL,60601,2,1,1100,450000
methodologies.json ADDED
@@ -0,0 +1,14 @@
+ {
+     "Gender Divide (Term Identity Diversity)": {
+         "description": "333",
+         "fx": "load_dataset_and_analyze_gender_tag"
+     },
+     "Gender Profession Bias (Lexical Evaluation)": {
+         "description": "This approach to addressing gender bias in language places a strong emphasis on a fundamental shift in detection and mitigation strategies.\n- Instead of solely relying on traditional frequency-based methods, this approach adopts a more nuanced perspective, prioritizing features within the text that consider contextual and semantic cues. It recognizes that gender bias extends beyond mere word frequency and delves into how language is structured and how it reinforces gender stereotypes.\n- Even with advanced models like Word Embedding and Contextual Word Embedding, which capture more complex language features, there's still a risk of inheriting biases from training data.\n- To tackle this, this approach advocates for a data-driven strategy, involving the collection and labeling of datasets encompassing various subtypes of bias, using a comprehensive taxonomy for precise categorization.",
+         "fx": "load_dataset_and_analyze_gender_profession"
+     },
+     "GenBiT (Microsoft Responsible AI Gender Bias Tool)": {
+         "description": "[GenBiT](https://www.microsoft.com/en-us/research/uploads/prod/2021/10/MSJAR_Genbit_Final_Version-616fd3a073758.pdf) is a versatile tool designed to address gender bias in language datasets by utilizing word co-occurrence statistical methods to measure bias. It introduces a novel approach to mitigating gender bias by combining contextual data augmentation, random sampling, sentence classification, and targeted gendered data filtering.\n- The primary goal is to reduce historical gender biases within conversational parallel multilingual datasets, ultimately enhancing the fairness and inclusiveness of machine learning model training and its subsequent applications.\n- What sets GenBiT apart is its adaptability to various forms of bias, not limited to gender alone. It can effectively address biases related to race, religion, or other dimensions, making it a valuable generic tool for bias mitigation in language datasets.\n- GenBiT's impact extends beyond bias reduction metrics; it has shown positive results in improving the performance of machine learning classifiers like Support Vector Machine(SVM). Augmented datasets produced by GenBiT yield significant enhancements in f1-score when compared to the original datasets, underlining its practical benefits in machine learning applications.",
+         "fx": "load_dataset_and_get_genbit_metrics"
+     }
+ }
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio==3.40.1
+ gradio_client==0.5.0
+ numpy==1.25.2
+ pandas==2.0.3
+ spacy
+ genbit
scripts/genbit_metrics.py ADDED
@@ -0,0 +1,48 @@
+ from genbit.genbit_metrics import GenBitMetrics
+ import pandas as pd
+ from utils.read_config import get_args
+ from utils.load_csv import load_sample
+
+
+ def cal_metrics(dataset):
+     # Create a GenBit object with the desired settings:
+     genbit_metrics_object = GenBitMetrics(language_code="en", context_window=5, distance_weight=0.95, percentile_cutoff=80)
+
+     # Add the sampled text to GenBit; a test sentence would work the same way, e.g.:
+     # dataset = ["I think she does not like cats. I think he does not like cats.", "He is a dog person."]
+     genbit_metrics_object.add_data(dataset, tokenized=False)
+
+     # Generate the gender bias metrics; output_statistics and output_word_list are
+     # enabled here so the full statistics and word lists are included in the result.
+     metrics = genbit_metrics_object.get_metrics(output_statistics=True, output_word_list=True)
+
+     return metrics
+
+
+ # Function to extract GenBit metrics
+ def extract_genbit_metrics(stats):
+     metrics = {}
+     metrics["genbit_score"] = str(stats["genbit_score"])
+     metrics["percentage_of_female_gender_definition_words"] = str(stats["percentage_of_female_gender_definition_words"])
+     metrics["percentage_of_male_gender_definition_words"] = str(stats["percentage_of_male_gender_definition_words"])
+     metrics["percentage_of_non_binary_gender_definition_words"] = str(stats["percentage_of_non_binary_gender_definition_words"])
+     metrics["percentage_of_trans_gender_definition_words"] = str(stats["percentage_of_trans_gender_definition_words"])
+     metrics["percentage_of_cis_gender_definition_words"] = str(stats["percentage_of_cis_gender_definition_words"])
+     metrics["num_words_considered"] = str(stats["statistics"]["num_words_considered"])
+
+     return metrics
+
+
+ def load_dataset_and_get_genbit_metrics(df, sample_method, col_name, num_sample_records):
+     sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+     # Turn the sampled column into a list of text.
+     sample_text = sample_df[col_name].tolist()
+
+     # Compute GenBit metrics and extract the reportable subset.
+     stats = cal_metrics(sample_text)
+     metrics = extract_genbit_metrics(stats)
+     return metrics
scripts/gender_profession_tagging.py ADDED
@@ -0,0 +1,129 @@
+ import pandas as pd
+ import re
+ import spacy
+ from spacy.lang.en import English
+ import time
+ from tqdm import tqdm
+ import multiprocessing.pool
+
+ import warnings
+ warnings.filterwarnings("ignore")
+ from utils.read_config import get_args
+ from utils.load_csv import load_sample
+
+
+ # For sentence splitting
+ nlp = English()
+ nlp.add_pipe("sentencizer")
+
+ # Function to split text into sentences
+ def get_split_text(text):
+     doc = nlp(text)
+     sentences = [sent for sent in doc.sents]
+     return sentences
+
+ def get_gender_prof_match_details(df_text):
+     # Get args from config file
+     male_pronoun = get_args("male_pronoun")
+     female_pronoun = get_args("female_pronoun")
+     professions = get_args("professions")
+
+     # Get regex patterns
+     male_pronoun_pat, female_pronoun_pat, professions_pat = get_regex_pattern(male_pronoun, female_pronoun, professions)
+
+     split_text = get_split_text(df_text)
+
+     results = []
+
+     for text in split_text:
+         male_pronoun_match = re.findall(male_pronoun_pat, str(text))
+         female_pronoun_match = re.findall(female_pronoun_pat, str(text))
+
+         prof_match = re.findall(professions_pat, str(text))
+
+         both_match = "No"
+
+         if len(male_pronoun_match) != 0 and len(prof_match) != 0:
+             both_match = "Yes"
+
+         if len(female_pronoun_match) != 0 and len(prof_match) != 0:
+             both_match = "Yes"
+
+         # Unpack from list
+         male_pronoun_match = ",".join(male_pronoun_match)
+         female_pronoun_match = ",".join(female_pronoun_match)
+
+         prof_match = ",".join(prof_match)
+
+         results.append((str(text), male_pronoun_match, female_pronoun_match, prof_match, both_match))
+
+     return results
+
+ # Function to run the matching over a multiprocessing thread pool
+ def call_multiprocessing_pool(df_text):
+     concurrent = 2000
+     pool = multiprocessing.pool.ThreadPool(processes=concurrent)
+     result_list = pool.map(get_gender_prof_match_details, df_text, chunksize=1)
+     pool.close()
+
+     # result_list is nested -- we need to flatten it
+     flat_return_list = [item for sublist in result_list for item in sublist]
+
+     # Add column names
+     cols = ["Split_Text", "Male Pronoun", "Female Pronoun", "Profession", "Both Match"]
+     return_df = pd.DataFrame(flat_return_list, columns=cols)
+
+     return return_df
+
+ # Function to get statistics
+ def get_statistics(results_df):
+     count_total_sentence = results_df.shape[0]
+     count_both_match = results_df[results_df["Both Match"] == "Yes"]["Both Match"].count()
+     count_male_pronoun = results_df[results_df["Male Pronoun"] != ""]["Male Pronoun"].count()
+     count_female_pronoun = results_df[results_df["Female Pronoun"] != ""]["Female Pronoun"].count()
+
+     count_male_pronoun_profession = results_df[(results_df["Male Pronoun"] != "") & (results_df["Profession"] != "")]["Male Pronoun"].count()
+     count_female_pronoun_profession = results_df[(results_df["Female Pronoun"] != "") & (results_df["Profession"] != "")]["Female Pronoun"].count()
+
+     return {
+         "total_sentence": str(count_total_sentence),
+         "both_gender_prof_match": str(count_both_match),
+         "count_male_pronoun": str(count_male_pronoun),
+         "count_female_pronoun": str(count_female_pronoun),
+         "count_male_pronoun_profession": str(count_male_pronoun_profession),
+         "count_female_pronoun_profession": str(count_female_pronoun_profession)
+     }
+
+ # Function to return regular expression patterns
+ def get_regex_pattern(male_pronoun, female_pronoun, professions):
+     male_pronoun_pat = r'\b({})\b'.format("|".join(male_pronoun))
+     female_pronoun_pat = r'\b({})\b'.format("|".join(female_pronoun))
+
+     # Lowercase the professions
+     professions = [prof.lower() for prof in professions]
+     professions_pat = r'\b({})\b'.format("|".join(professions))
+
+     return male_pronoun_pat, female_pronoun_pat, professions_pat
+
+
+ def load_dataset_and_analyze_gender_profession(df, sample_method, col_name, num_sample_records):
+     # Load the requested sample
+     sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+     # Lowercase and strip the text column
+     sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
+
+     # Run matching over the thread pool
+     results_df = call_multiprocessing_pool(sample_df[col_name])
+
+     # Get statistics
+     stats = get_statistics(results_df)
+
+     return stats
scripts/gender_tagging.py ADDED
@@ -0,0 +1,93 @@
+ # Import required libraries
+ import pandas as pd
+ import re
+ from utils.read_config import get_args
+ from utils.load_csv import load_sample
+
+ # Function to get count of male terms in text
+ def count_male_terms(text, male_terms):
+     # Get pattern
+     pattern = r"\b({})\b".format("|".join(male_terms))
+     match = re.findall(pattern, str(text))
+     return len(match)
+
+ # Function to get count of female terms in text
+ def count_female_terms(text, female_terms):
+     # Get pattern
+     pattern = r"\b({})\b".format("|".join(female_terms))
+     match = re.findall(pattern, str(text))
+     return len(match)
+
+ # Function to get gender tag categories
+ def get_gender_tag(count_m_term, count_f_term):
+     tag = ''
+     if count_m_term == 0 and count_f_term == 0:
+         tag = "No Gender"
+
+     elif count_m_term == count_f_term:
+         tag = "Equal Gender"
+
+     elif count_m_term > count_f_term:
+         m_proportion = (count_m_term / (count_m_term + count_f_term)) * 100
+         if m_proportion >= 50 and m_proportion < 75:
+             tag = "Male Positive Gender"
+         elif m_proportion >= 75:
+             tag = "Male Strongly Positive Gender"
+
+     elif count_m_term < count_f_term:
+         f_proportion = (count_f_term / (count_m_term + count_f_term)) * 100
+         if f_proportion >= 50 and f_proportion < 75:
+             tag = "Female Positive Gender"
+         elif f_proportion >= 75:
+             tag = "Female Strongly Positive Gender"
+
+     return tag
+
+
+ # Function to calculate PG and SPG
+ def get_pg_spg(sample_df):
+     count_no_gender_sentences = sample_df[sample_df["gender_cat"] == "No Gender"]['gender_cat'].count()
+
+     count_gender_sentences = sample_df[sample_df["gender_cat"] != "No Gender"]['gender_cat'].count()
+     count_equal_gender = sample_df[sample_df["gender_cat"] == "Equal Gender"]['gender_cat'].count()
+
+     count_male_pg = sample_df[sample_df['gender_cat'] == "Male Positive Gender"]['gender_cat'].count()
+     count_male_spg = sample_df[sample_df['gender_cat'] == "Male Strongly Positive Gender"]['gender_cat'].count()
+
+     count_female_pg = sample_df[sample_df['gender_cat'] == "Female Positive Gender"]['gender_cat'].count()
+     count_female_spg = sample_df[sample_df['gender_cat'] == "Female Strongly Positive Gender"]['gender_cat'].count()
+
+     return {
+         "gender": str(count_gender_sentences),
+         "no gender": str(count_no_gender_sentences),
+         "equal gender": str(count_equal_gender),
+         "female pg": str(count_female_pg),
+         "male pg": str(count_male_pg),
+         "female spg": str(count_female_spg),
+         "male spg": str(count_male_spg)
+     }
+
+ # Function to load dataset and get the analysis done
+ def load_dataset_and_analyze_gender_tag(df, sample_method, col_name, num_sample_records):
+     # Read config file
+     male_terms = get_args("male_terms")
+     female_terms = get_args("female_terms")
+     # Load sample
+     sample_df = load_sample(num_sample_records, sample_method, df, col_name)
+
+     # Lowercase and strip the text column
+     sample_df[col_name] = sample_df[col_name].str.lower().str.strip()
+
+     # Get new count columns - male terms and female terms
+     sample_df['count_male_term'] = sample_df.apply(lambda x: count_male_terms(x[col_name], male_terms), axis=1)
+     sample_df['count_female_term'] = sample_df.apply(lambda x: count_female_terms(x[col_name], female_terms), axis=1)
+
+     # Get tag categories
+     sample_df['gender_cat'] = sample_df.apply(lambda row: get_gender_tag(row['count_male_term'], row['count_female_term']), axis=1)
+
+     # Get statistics
+     collection = get_pg_spg(sample_df)
+     return collection
utils/config.json ADDED
@@ -0,0 +1,160 @@
+ {
+     "first_records" : 2000,
+     "random_seed" : 42,
+     "male_terms" : ["man", "boy", "male", "he", "son", "his", "himself", "guy", "father", "john"],
+     "female_terms" : ["woman", "girl", "female", "she", "daughter", "her", "herself", "gal", "mother", "mary"],
+     "male_pronoun" : ["he", "him", "his"],
+     "female_pronoun" : ["she", "her", "hers"],
+     "professions" : ["Accountant",
+         "Actor",
+         "Actress",
+         "Aerospace Engineer",
+         "Agricultural Scientist",
+         "Air Traffic Controller",
+         "Aircraft Mechanic",
+         "Animator",
+         "Architect",
+         "Art Director",
+         "Attorney",
+         "Lawyer",
+         "Audiologist",
+         "Author",
+         "Writer",
+         "Baker",
+         "Barber",
+         "Hairdresser",
+         "Bartender",
+         "Biomedical Engineer",
+         "Botanist",
+         "Broadcast Journalist",
+         "Business Analyst",
+         "Carpenter",
+         "Chef",
+         "Cook",
+         "Chemist",
+         "Civil Engineer",
+         "Clinical Psychologist",
+         "Commercial Diver",
+         "Computer Programmer",
+         "Construction Worker",
+         "Corporate Trainer",
+         "Cosmetologist",
+         "Counselor",
+         "Therapist",
+         "Court Reporter",
+         "Creative Director",
+         "Criminologist",
+         "Customer Service Representative",
+         "Data Analyst",
+         "Dental Assistant",
+         "Dentist",
+         "Dermatologist",
+         "Dietician",
+         "Nutritionist",
+         "Doctor",
+         "Physician",
+         "Economist",
+         "Electrician",
+         "Elementary School Teacher",
+         "Emergency Medical Technician",
+         "Engineer",
+         "Environmental Scientist",
+         "Event Planner",
+         "Fashion Designer",
+         "Film Director",
+         "Financial Analyst",
+         "Firefighter",
+         "Fisherman",
+         "Fitness Trainer",
+         "Flight Attendant",
+         "Florist",
+         "Food Scientist",
+         "Forensic Scientist",
+         "Furniture Maker",
+         "Game Developer",
+         "Gardener",
+         "Landscaper",
+         "Geologist",
+         "Graphic Designer",
+         "Hair Stylist",
+         "Historian",
+         "Home Health Aide",
+         "Hotel Manager",
+         "Human Resources Manager",
+         "Immigration Lawyer",
+         "Industrial Designer",
+         "Insurance Agent",
+         "Interior Designer",
+         "Interpreter",
+         "Translator",
+         "Investment Banker",
+         "IT Specialist",
+         "Journalist",
+         "Judge",
+         "Kindergarten Teacher",
+         "Land Surveyor",
+         "Landscape Architect",
+         "Lawyer",
+         "Attorney",
+         "Librarian",
+         "Life Coach",
+         "Linguist",
+         "Makeup Artist",
+         "Management Consultant",
+         "Manufacturing Engineer",
+         "Marine Biologist",
+         "Marketing Manager",
+         "Massage Therapist",
+         "Mechanical Engineer",
+         "Medical Assistant",
+         "Medical Researcher",
+         "Meteorologist",
+         "Midwife",
+         "Military Officer",
+         "Music Producer",
+         "Musician",
+         "Nurse",
+         "Occupational Therapist",
+         "Optician",
+         "Optometrist",
+         "Paralegal",
+         "Paramedic",
+         "Patent Attorney",
+         "Pediatrician",
+         "Personal Trainer",
+         "Petroleum Engineer",
+         "Pharmacist",
+         "Photographer",
+         "Physical Therapist",
+         "Physician Assistant",
+         "Pilot",
+         "Plumber",
+         "Police Officer",
+         "Political Scientist",
+         "Preschool Teacher",
+         "Private Investigator",
+         "Product Manager",
+         "Professor",
+         "Lecturer",
+         "Programmer",
+         "Psychiatrist",
+         "Psychologist",
+         "Public Relations Specialist",
+         "Public School Teacher",
+         "Real Estate Agent",
+         "Broker",
+         "Receptionist",
+         "Registered Nurse",
+         "Reporter",
+         "Restaurant Manager",
+         "Sales Representative",
+         "School Counselor",
+         "Scientist",
+         "Screenwriter",
+         "Social Media Manager",
+         "Social Worker",
+         "Software Developer",
+         "Speech-Language Pathologist",
+         "Sports Coach",
+         "Statistician"]
+ }
utils/load_csv.py ADDED
@@ -0,0 +1,23 @@
+ import pandas as pd
+ from utils.read_config import get_args
+
+
+ # Function to load a sample of the dataset
+ def load_sample(num_sample_records, sample_method, df, col_name):
+     sample_first_records = get_args("first_records")
+     sample_random_seed = get_args("random_seed")
+
+     num_sample_records = num_sample_records if num_sample_records <= sample_first_records else sample_first_records
+
+     # Keep only the required column
+     df = df[[col_name]]
+     if sample_method == "First":
+         df = df.iloc[:num_sample_records].copy().reset_index()
+     if sample_method == "Last":
+         df = df.iloc[-num_sample_records:].copy().reset_index()
+     if sample_method == "Random":
+         df = df.sample(num_sample_records,
+                        random_state=sample_random_seed).copy().reset_index()
+     return df
utils/read_config.py ADDED
@@ -0,0 +1,13 @@
+ import json
+
+ def read_config_file():
+     with open("utils/config.json", "r") as jsonfile:
+         data = json.load(jsonfile)
+     return data
+
+ def get_args(args):
+     try:
+         data = read_config_file()
+     except Exception as e:
+         # Raising a bare string is invalid in Python 3; raise a proper exception instead.
+         raise RuntimeError("Could not read config file.") from e
+     return data[args]
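
Reviewer note: the scripts added above can be exercised without the Gradio UI. Below is a minimal, illustrative sketch (not part of this commit) that runs the Gender Divide analysis on one of the bundled example CSVs; it assumes execution from the repository root so that utils/config.json and data/ resolve, and the column name FirstName is just one text-typed column from data/z_employee.csv.

```python
# Illustrative only -- not part of this commit.
import pandas as pd

from scripts.gender_tagging import load_dataset_and_analyze_gender_tag

df = pd.read_csv("data/z_employee.csv")

# sample_method mirrors the "Scope" radio in app.py: "First", "Last" or "Random".
stats = load_dataset_and_analyze_gender_tag(
    df, sample_method="First", col_name="FirstName", num_sample_records=10
)
print(stats)  # dict of counts, e.g. keys "gender", "no gender", "male pg", "female spg", ...
```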