Abhinav Gavireddi committed on
Commit
391f4fe
·
1 Parent(s): 8dd32aa

[fix]: fixed pdf parsing

Browse files
Files changed (2) hide show
  1. src/__init__.py +4 -2
  2. src/ghm.py +12 -6
src/__init__.py CHANGED
@@ -7,9 +7,13 @@ from sentence_transformers import SentenceTransformer
7
  import torch
8
  import chromadb
9
  from src.utils import OpenAIEmbedder, LocalEmbedder
 
10
 
11
  load_dotenv()
12
 
 
 
 
13
  def sanitize_html(raw):
14
  # allow only text and basic tags
15
  return bleach.clean(raw, tags=[], strip=True)
@@ -68,5 +72,3 @@ def get_embedder():
68
  else:
69
  logger.info(f"Using local embedder with model: {EmbeddingConfig.TEXT_MODEL}")
70
  return LocalEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
71
-
72
-
 
7
  import torch
8
  import chromadb
9
  from src.utils import OpenAIEmbedder, LocalEmbedder
10
+ from src.ghm import initialize_models
11
 
12
  load_dotenv()
13
 
14
+ # Initialize models and configurations at startup
15
+ initialize_models()
16
+
17
  def sanitize_html(raw):
18
  # allow only text and basic tags
19
  return bleach.clean(raw, tags=[], strip=True)
 
72
  else:
73
  logger.info(f"Using local embedder with model: {EmbeddingConfig.TEXT_MODEL}")
74
  return LocalEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
 
 
src/ghm.py CHANGED
@@ -1,10 +1,9 @@
1
  import json
2
  import os
3
 
 
4
  import requests
5
  from huggingface_hub import snapshot_download
6
- from utils import logger
7
-
8
 
9
  def download_json(url):
10
  response = requests.get(url)
@@ -28,13 +27,16 @@ def download_and_modify_json(url, local_filename, modifications):
28
  json.dump(data, f, ensure_ascii=False, indent=4)
29
 
30
 
31
- if __name__ == '__main__':
32
-
 
 
 
33
  mineru_patterns = [
34
  # "models/Layout/LayoutLMv3/*",
35
  "models/Layout/YOLO/*",
36
- # "models/MFD/YOLO/*",
37
- # "models/MFR/unimernet_hf_small_2503/*",
38
  "models/OCR/paddleocr_torch/*",
39
  # "models/TabRec/TableMaster/*",
40
  # "models/TabRec/StructEqTable/*",
@@ -69,3 +71,7 @@ if __name__ == '__main__':
69
 
70
  download_and_modify_json(json_url, config_file, json_mods)
71
  logger.info(f'The configuration file has been configured successfully, the path is: {config_file}')
 
 
 
 
 
1
  import json
2
  import os
3
 
4
+ from loguru import logger
5
  import requests
6
  from huggingface_hub import snapshot_download
 
 
7
 
8
  def download_json(url):
9
  response = requests.get(url)
 
27
  json.dump(data, f, ensure_ascii=False, indent=4)
28
 
29
 
30
+ def initialize_models():
31
+ """
32
+ Downloads and configures all necessary models and settings.
33
+ This function is designed to be called once at application startup.
34
+ """
35
  mineru_patterns = [
36
  # "models/Layout/LayoutLMv3/*",
37
  "models/Layout/YOLO/*",
38
+ "models/MFD/YOLO/*",
39
+ "models/MFR/unimernet_hf_small_2503/*",
40
  "models/OCR/paddleocr_torch/*",
41
  # "models/TabRec/TableMaster/*",
42
  # "models/TabRec/StructEqTable/*",
 
71
 
72
  download_and_modify_json(json_url, config_file, json_mods)
73
  logger.info(f'The configuration file has been configured successfully, the path is: {config_file}')
74
+
75
+
76
+ if __name__ == '__main__':
77
+ initialize_models()