Abhinav Gavireddi
committed on
Commit
·
391f4fe
1
Parent(s):
8dd32aa
[fix]: fixed pdf parsing
Browse files
- src/__init__.py +4 -2
- src/ghm.py +12 -6
src/__init__.py
CHANGED
@@ -7,9 +7,13 @@ from sentence_transformers import SentenceTransformer
|
|
7 |
import torch
|
8 |
import chromadb
|
9 |
from src.utils import OpenAIEmbedder, LocalEmbedder
|
|
|
10 |
|
11 |
load_dotenv()
|
12 |
|
|
|
|
|
|
|
13 |
def sanitize_html(raw):
|
14 |
# allow only text and basic tags
|
15 |
return bleach.clean(raw, tags=[], strip=True)
|
@@ -68,5 +72,3 @@ def get_embedder():
|
|
68 |
else:
|
69 |
logger.info(f"Using local embedder with model: {EmbeddingConfig.TEXT_MODEL}")
|
70 |
return LocalEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
|
71 |
-
|
72 |
-
|
|
|
7 |
import torch
|
8 |
import chromadb
|
9 |
from src.utils import OpenAIEmbedder, LocalEmbedder
|
10 |
+
from src.ghm import initialize_models
|
11 |
|
12 |
load_dotenv()
|
13 |
|
14 |
+
# Initialize models and configurations at startup
|
15 |
+
initialize_models()
|
16 |
+
|
17 |
def sanitize_html(raw):
|
18 |
# allow only text and basic tags
|
19 |
return bleach.clean(raw, tags=[], strip=True)
|
|
|
72 |
else:
|
73 |
logger.info(f"Using local embedder with model: {EmbeddingConfig.TEXT_MODEL}")
|
74 |
return LocalEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
|
|
|
|
src/ghm.py
CHANGED
@@ -1,10 +1,9 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
|
|
|
4 |
import requests
|
5 |
from huggingface_hub import snapshot_download
|
6 |
-
from utils import logger
|
7 |
-
|
8 |
|
9 |
def download_json(url):
|
10 |
response = requests.get(url)
|
@@ -28,13 +27,16 @@ def download_and_modify_json(url, local_filename, modifications):
|
|
28 |
json.dump(data, f, ensure_ascii=False, indent=4)
|
29 |
|
30 |
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
33 |
mineru_patterns = [
|
34 |
# "models/Layout/LayoutLMv3/*",
|
35 |
"models/Layout/YOLO/*",
|
36 |
-
|
37 |
-
|
38 |
"models/OCR/paddleocr_torch/*",
|
39 |
# "models/TabRec/TableMaster/*",
|
40 |
# "models/TabRec/StructEqTable/*",
|
@@ -69,3 +71,7 @@ if __name__ == '__main__':
|
|
69 |
|
70 |
download_and_modify_json(json_url, config_file, json_mods)
|
71 |
logger.info(f'The configuration file has been configured successfully, the path is: {config_file}')
|
|
|
|
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
|
4 |
+
from loguru import logger
|
5 |
import requests
|
6 |
from huggingface_hub import snapshot_download
|
|
|
|
|
7 |
|
8 |
def download_json(url):
|
9 |
response = requests.get(url)
|
|
|
27 |
json.dump(data, f, ensure_ascii=False, indent=4)
|
28 |
|
29 |
|
30 |
+
def initialize_models():
|
31 |
+
"""
|
32 |
+
Downloads and configures all necessary models and settings.
|
33 |
+
This function is designed to be called once at application startup.
|
34 |
+
"""
|
35 |
mineru_patterns = [
|
36 |
# "models/Layout/LayoutLMv3/*",
|
37 |
"models/Layout/YOLO/*",
|
38 |
+
"models/MFD/YOLO/*",
|
39 |
+
"models/MFR/unimernet_hf_small_2503/*",
|
40 |
"models/OCR/paddleocr_torch/*",
|
41 |
# "models/TabRec/TableMaster/*",
|
42 |
# "models/TabRec/StructEqTable/*",
|
|
|
71 |
|
72 |
download_and_modify_json(json_url, config_file, json_mods)
|
73 |
logger.info(f'The configuration file has been configured successfully, the path is: {config_file}')
|
74 |
+
|
75 |
+
|
76 |
+
if __name__ == '__main__':
|
77 |
+
initialize_models()
|