biasaware / scripts /gender_divide.py
freyam's picture
Restructure Logic and Data Flow
e0db39e
raw
history blame
3.11 kB
import re
import json
# Load the gender lexicons once at import time. The path is relative, so it is
# resolved against the process's current working directory. The file is opened
# in a ``with`` block so the handle is closed as soon as parsing finishes.
with open("config/gender_lexicons.json", "r") as _lexicon_file:
    gender_lexicons = json.load(_lexicon_file)
def count_male_terms(text, male_terms):
    """Count whole-word occurrences of male lexicon terms in ``text``.

    Matching is case-sensitive and anchored on regex word boundaries, so a
    term only counts when it is not embedded in a longer word.

    Args:
        text: Value to search. It is converted with ``str()`` before matching.
        male_terms: Collection of terms. Each term is matched literally.
            ``None`` or an empty collection means there is nothing to count.

    Returns:
        The number of non-overlapping matches, or 0 when no terms are given.
    """
    # An empty alternation would compile to ``\b()\b``, which matches at every
    # word boundary and reports a bogus non-zero count.
    if not male_terms:
        return 0
    # Escape each term so characters such as "." or "+" are matched literally
    # instead of being interpreted as regex operators.
    alternation = "|".join(re.escape(term) for term in male_terms)
    pattern = r"\b(?:{})\b".format(alternation)
    return len(re.findall(pattern, str(text)))
def count_female_terms(text, female_terms):
    """Count whole-word occurrences of female lexicon terms in ``text``.

    Matching is case-sensitive and anchored on regex word boundaries, so a
    term only counts when it is not embedded in a longer word.

    Args:
        text: Value to search. It is converted with ``str()`` before matching.
        female_terms: Collection of terms. Each term is matched literally.
            ``None`` or an empty collection means there is nothing to count.

    Returns:
        The number of non-overlapping matches, or 0 when no terms are given.
    """
    # An empty alternation would compile to ``\b()\b``, which matches at every
    # word boundary and reports a bogus non-zero count.
    if not female_terms:
        return 0
    # Escape each term so characters such as "." or "+" are matched literally
    # instead of being interpreted as regex operators.
    alternation = "|".join(re.escape(term) for term in female_terms)
    pattern = r"\b(?:{})\b".format(alternation)
    return len(re.findall(pattern, str(text)))
def get_gender_tag(count_m_term, count_f_term):
    """Classify a pair of male/female term counts into a gender tag.

    Args:
        count_m_term: Number of male terms found.
        count_f_term: Number of female terms found.

    Returns:
        "No Gender" when both counts are zero, "Equal Gender" when they are
        equal, otherwise "<Male|Female> Positive Gender" when the larger side
        holds at least 50% but less than 75% of all terms, or
        "<Male|Female> Strongly Positive Gender" at 75% or more. An empty
        string is returned when none of these cases applies.
    """
    if count_m_term == 0 and count_f_term == 0:
        return "No Gender"
    if count_m_term == count_f_term:
        return "Equal Gender"

    # Pick the dominant side and the two labels it can produce.
    if count_m_term > count_f_term:
        dominant = count_m_term
        mild_tag = "Male Positive Gender"
        strong_tag = "Male Strongly Positive Gender"
    elif count_m_term < count_f_term:
        dominant = count_f_term
        mild_tag = "Female Positive Gender"
        strong_tag = "Female Strongly Positive Gender"
    else:
        # Counts that are not comparable (e.g. NaN) fall through untagged.
        return ""

    share = (dominant / (count_m_term + count_f_term)) * 100
    if share >= 75:
        return strong_tag
    if share >= 50:
        return mild_tag
    return ""
def get_pg_spg(sample_df):
    """Summarise how many rows fall into each gender category.

    Args:
        sample_df: pandas DataFrame with a "gender_cat" column holding the
            tag assigned to each row.

    Returns:
        A dict of stringified counts keyed by "gender" (rows with any tag
        other than "No Gender"), "no gender", "equal gender", "female pg",
        "male pg", "female spg" and "male spg". Rows whose "gender_cat" is
        missing are not counted in any bucket.
    """
    categories = sample_df["gender_cat"]
    # One pass over the column; value_counts() skips missing values.
    tally = categories.value_counts().to_dict()

    no_gender = tally.get("No Gender", 0)
    # count() also ignores missing values, so this is every tagged row that is
    # not "No Gender".
    gendered = int(categories.count()) - no_gender

    return {
        "gender": str(gendered),
        "no gender": str(no_gender),
        "equal gender": str(tally.get("Equal Gender", 0)),
        "female pg": str(tally.get("Female Positive Gender", 0)),
        "male pg": str(tally.get("Male Positive Gender", 0)),
        # The label must be spelled "Strongly" to match the tag stored in
        # "gender_cat"; a misspelling here silently yields a count of zero.
        "female spg": str(tally.get("Female Strongly Positive Gender", 0)),
        "male spg": str(tally.get("Male Strongly Positive Gender", 0)),
    }
def eval_gender_divide(data):
    """Tag every row of ``data`` by its gendered-term balance and summarise.

    The first column of ``data`` is treated as the text to analyse. ``data``
    is modified in place: the text column is lower-cased and stripped, and the
    columns "count_male_term", "count_female_term" and "gender_cat" are added.

    Args:
        data: pandas DataFrame whose first column supports the ``.str``
            accessor.

    Returns:
        The per-category count summary produced by ``get_pg_spg``.
    """
    male_terms = gender_lexicons.get("male_lexicons")
    female_terms = gender_lexicons.get("female_lexicons")

    text_col = data.columns[0]
    data[text_col] = data[text_col].str.lower().str.strip()

    data["count_male_term"] = data[text_col].apply(
        lambda text: count_male_terms(text, male_terms)
    )
    # Count female terms in the text cell only, exactly like the male count.
    # Passing the whole row would make the counter search str(row), i.e. the
    # row's printed representation including the other columns.
    data["count_female_term"] = data[text_col].apply(
        lambda text: count_female_terms(text, female_terms)
    )
    data["gender_cat"] = [
        get_gender_tag(male_count, female_count)
        for male_count, female_count in zip(
            data["count_male_term"], data["count_female_term"]
        )
    ]

    return get_pg_spg(data)