goberoi committed on
Commit
d1923ad
·
verified ·
1 Parent(s): 8774b52

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +22 -0
main.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chardet
2
+ import spacy
3
+ from gliner_spacy import GLiNERComponent
4
+
5
# Load the spaCy English pipeline and append the GLiNER component to it.
# NOTE(review): the gliner-spacy README registers the "gliner_spacy" factory
# via `from gliner_spacy.pipeline import GlinerSpacy`; confirm that the
# gliner_spacy import at the top of this file actually resolves.
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(
    "gliner_spacy",
    config={
        "labels": ["PERSON", "ORGANIZATION", "LOCATION", "DISEASE"],
        # The factory's config key for the checkpoint is "gliner_model";
        # a "model" key is not part of its config and fails spaCy's
        # config validation instead of selecting this checkpoint.
        "gliner_model": "urchade/gliner_multi_pii-v1",
    },
    last=True,
)
11
+
12
def detect_encoding(file_bytes):
    """Guess the text encoding of *file_bytes* with chardet.

    Args:
        file_bytes: Raw bytes to inspect.

    Returns:
        The encoding name reported by ``chardet.detect``, or ``'utf-8'``
        when chardet reports no encoding.
    """
    result = chardet.detect(file_bytes)
    # chardet always includes the 'encoding' key and sets it to None when it
    # cannot decide (e.g. empty input), so a .get() default would never be
    # used; fall back on any falsy value instead.
    return result.get('encoding') or 'utf-8'
15
+
16
def extract_entities_from_file(file):
    """Extract named entities from a binary file-like object.

    Reads the whole of *file*, decodes it using the detected encoding
    (undecodable bytes are dropped), runs the module-level ``nlp`` pipeline
    on the text and collects the resulting entities.

    Args:
        file: Object whose ``read()`` returns the file content as bytes.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entity in
        ``doc.ents``.
    """
    file_bytes = file.read()
    # Detection can yield no encoding (e.g. an empty file); decoding with
    # None would raise TypeError, so default to UTF-8 here.
    encoding = detect_encoding(file_bytes) or 'utf-8'
    try:
        text = file_bytes.decode(encoding, errors='ignore')
    except LookupError:
        # The detected name is not a codec this Python knows; fall back to
        # UTF-8 rather than failing the whole extraction.
        text = file_bytes.decode('utf-8', errors='ignore')
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]