Spaces:
Runtime error
Runtime error
File size: 5,278 Bytes
9fa6437 7d2f336 b5dcb77 7d2f336 710b787 b962a46 b5dcb77 b962a46 7d2f336 9fa6437 a87bd77 7d2f336 710b787 7d2f336 9fec05e 7d2f336 9fec05e 7d2f336 9fec05e 7d2f336 9fec05e a87bd77 7d2f336 9fec05e 7d2f336 9fec05e 7d2f336 862cd29 7d2f336 7bc15eb 9fa6437 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import gradio as gr
import pypandoc
import glob
import shutil
import os
import tqdm
from huggingface_hub import snapshot_download
import tempfile
import re
import pdfminer
# Print the installed versions of the two document-reading packages at startup.
# The second label is "pandoc" although the value comes from the pypandoc package.
for _label, _module in (("pdfminer", pdfminer), ("pandoc", pypandoc)):
    print(_label, _module.__version__)
#from docx import Document
#document = Document()
#document.add_heading('Labels for ', level=1)
# Folder that receives the per-category "<label>.result.txt" files.
RESULTS_FOLDER = "./results"

# Value returned by downloading the "claudiag/atlas" repository; it is used as
# the root folder whose sub-folders are scanned for documents. The access
# token is read from the HF_TOKEN environment variable (None when unset).
DOC_FOLDER = snapshot_download(
    "claudiag/atlas",
    token=os.environ.get("HF_TOKEN"),
)

# Category label -> default codewords searched for in each document.
CAT_TO_CODEWORDS = {
    "Prejudices": [
        "prejudice", "judge", "preconceive", "stigma", "assumption", "assume",
        "misunderstanding", "unexamined", "distorted", "clear", "compar",
    ],
    "Self-knowledge": [
        "self-knowledge", "self-awareness", "introspection", "examined",
        "myself", "realization", "belief",
    ],
    "Similarities": [
        "similarity", "same", "similar", "equal", "related", "together",
    ],
    "Diversity": [
        "diverse", "different", "diverse", "particular", "range",
        "multiplicity",
    ],
    "Business school": [
        "ESADE", "competitive", "business school", "education", "study",
        "university", "student", "consulting", "professional", "pressure",
        "performance", "institution",
    ],
    "Courage": ["courage", "brave", "dare", "step", "determine"],
    "Change": [
        "change", "finally", "at last", "decided", "chose", "concluded",
        "want to", "swap", "different", "not the same", "replace", "convert",
        "trade", "future", "decision",
    ],
    "Coherence": ["coherent", "align", "incoherent", "consistent"],
    "Voicing": [
        "speak", "express", "voice", "talk", "say", "open up", "articulate",
        "communicate", "convey", "reveal", "show", "verbalize", "phrase",
        "word",
    ],
    "Listening": [
        "listen", "pay attention", "quiet", "silence", "process", "hear",
        "attend",
    ],
    "Understanding": [
        "learn", "understand", "realize", "see", "believe", "question",
        "critical", "thought", "reasonable", "logical", "rational",
        "comprehensible", "accept",
    ],
    "Relationships": [
        "relationship", "relate", "bond", "connection", "bond", "others",
        "appreciate", "appreciation", "recognize", "recognition",
        "acknowledge",
    ],
    "Emotions": [
        "emotions", "felt", "feel", "a feeling of", "sense", "sensation",
        "instinct", "sentiment", "gut feeling", "intense", "wave",
    ],
    "The course": [
        "first time", "never", "always", "course", "elective",
        "Socratic Dialogue", "dialogue", "debate", "enroll", "arguments",
    ],
}

# Dict view of the category labels, in the insertion order shown above.
CATEGORIES = CAT_TO_CODEWORDS.keys()
def retrieve_lines(filename):
    """Extract the text of a document and split it into "."-delimited chunks.

    PDF files are read with pdfminer; ``.docx``/``.doc`` files are converted
    to plain text with pypandoc via a temporary file. The resulting lines are
    stripped, joined with single spaces and finally split on ``"."``.

    Args:
        filename: Path of a ``.pdf``, ``.docx`` or ``.doc`` file. The
            extension is compared case-insensitively.

    Returns:
        A list of strings, one per ``"."``-delimited chunk of the text.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    extension = os.path.splitext(filename)[1].lower().lstrip(".")

    if extension == "pdf":
        # A bare ``import pdfminer`` does not load the ``high_level``
        # submodule, so import the function explicitly where it is needed.
        from pdfminer.high_level import extract_text

        raw_lines = extract_text(filename).split("\n")
    elif extension in ("docx", "doc"):
        with tempfile.TemporaryDirectory() as tmpdirname:
            outfile = os.path.join(tmpdirname, "temp.txt")
            pypandoc.convert_file(filename, "plain", outputfile=outfile)
            # Read the file back as UTF-8 (pandoc's output encoding) instead
            # of relying on the platform default.
            with open(outfile, "r", encoding="utf-8") as f:
                raw_lines = f.readlines()
    else:
        # Previously this fell through to an UnboundLocalError.
        raise ValueError(f"Unsupported file extension: {filename!r}")

    text = " ".join(line.strip() for line in raw_lines)
    return text.split(".")
def match_code(lines, codewords):
    """Find the lines that contain any codeword and upper-case the matches.

    Args:
        lines: Sequence of text chunks to search.
        codewords: Iterable of literal words or phrases. Each one is matched
            case-sensitively and must be delimited by word boundaries on
            both sides.

    Returns:
        A dict mapping the index of every matching line to that line with
        each match upper-cased and all runs of whitespace collapsed to a
        single space. The dict is empty when ``codewords`` contains no
        non-blank entry.
    """
    # Escape the codewords so they are treated as literal text, and drop
    # blank ones: an empty alternative would match at every word boundary
    # and flag every line as a hit.
    escaped = [re.escape(word) for word in codewords if word and word.strip()]
    if not escaped:
        return {}

    keywords_to_match = re.compile(r"\b(?:" + "|".join(escaped) + r")\b")

    match_dict = {}
    for index, line in enumerate(lines):
        # subn rewrites every match in one pass and reports how many it found.
        highlighted, hits = keywords_to_match.subn(
            lambda m: m.group().upper(), line
        )
        if hits:
            match_dict[index] = " ".join(highlighted.split())
    return match_dict
def main(filename, codewords_mapping=None):
    """Scan one document and append its matches to the per-category files.

    For every category whose codewords match at least one chunk of the
    document, a section listing the matching chunks is appended to
    ``<RESULTS_FOLDER>/<label with spaces replaced by "_">.result.txt``.
    A "# Code" header is written first when that file does not exist yet.

    Args:
        filename: Path of the document to scan (passed to ``retrieve_lines``).
        codewords_mapping: Dict mapping a category label to its codewords.
            Defaults to ``CAT_TO_CODEWORDS`` when omitted, so the function
            can also be called with the filename alone.

    Returns:
        The list of result-file paths that were written to for this document.
    """
    if codewords_mapping is None:
        codewords_mapping = CAT_TO_CODEWORDS

    lines = retrieve_lines(filename)
    rule = 25 * "-"
    files = []
    for label, codewords in codewords_mapping.items():
        match = match_code(lines, codewords)
        if not match:
            continue

        result_file = ".".join(["_".join(label.split()), "result", "txt"])
        result_file = os.path.join(RESULTS_FOLDER, result_file)

        parts = []
        if not os.path.exists(result_file):
            # First hit for this category: start the file with its header.
            parts.append(f"# Code: {label}\n" + 25 * "=" + "\n\n")
        parts.append(f"## Source: {filename}\n" + rule + "\n")
        parts.append("\n".join(f"-{v}" for v in match.values()) + "\n")
        parts.append(rule + "\n\n")

        # Make sure the target folder exists and write with an explicit
        # encoding so non-ASCII document text does not depend on the locale.
        os.makedirs(RESULTS_FOLDER, exist_ok=True)
        with open(result_file, "a", encoding="utf-8") as f:
            f.write("".join(parts))
        files.append(result_file)
    return files
def _parse_keywords(raw, default):
    """Turn one textbox value into a list of codewords, or ``default`` if blank."""
    if raw is None:
        return list(default)
    if isinstance(raw, str):
        # Keywords are expected as a comma-separated string.
        raw = raw.split(",")
    # Also tolerate a pasted Python-list-style value such as "['a', 'b']".
    words = [str(item).strip().strip("[]'\"").strip() for item in raw]
    words = [word for word in words if word]
    return words or list(default)


def convert(*keywords):
    """Scan every downloaded document for the codewords of each category.

    The results folder is emptied and recreated, then every file found one
    level below the sub-folders of ``DOC_FOLDER`` is processed with ``main``.
    A file that fails to process is reported on stdout and skipped.

    Args:
        *keywords: One value per category, in the order of ``CATEGORIES``.
            Each value is a comma-separated string of codewords; a blank
            value falls back to that category's defaults from
            ``CAT_TO_CODEWORDS``.

    Returns:
        The string ``"Retrieved from <n>"`` where ``n`` is the number of
        directory entries that were visited.
    """
    codewords_mapping = {
        category: _parse_keywords(raw, CAT_TO_CODEWORDS[category])
        for category, raw in zip(CATEGORIES, keywords)
    }

    shutil.rmtree(RESULTS_FOLDER, ignore_errors=True)
    # exist_ok: rmtree ignores errors, so the folder may still be present.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    num_files = 0
    # A second path component starting with "/" makes os.path.join discard
    # DOC_FOLDER, so join with plain "*" patterns instead.
    for folder in tqdm.tqdm(glob.glob(os.path.join(DOC_FOLDER, "*"))):
        all_files = glob.glob(os.path.join(folder, "*"))
        num_files += len(all_files)
        for filename in tqdm.tqdm(all_files):
            try:
                main(filename, codewords_mapping)
            except Exception as e:
                # Best effort: report the failing document and keep going.
                print(f"{filename} not working because \n {e}")
    return f"Retrieved from {num_files}"
# One textbox per category. Gradio documents ``placeholder`` as a string, so
# the default codewords are shown as a comma-separated hint rather than being
# passed as a raw list.
inputs = [
    gr.Textbox(
        label=f"Enter your keywords for {k}",
        max_lines=2,
        placeholder=", ".join(CAT_TO_CODEWORDS[k]),
    )
    for k in CATEGORIES
]

# Text-in / text-out interface around ``convert``; launched at import time.
iface = gr.Interface(fn=convert, inputs=inputs, outputs="text")
iface.launch()
|