jwlee-ai committed
Commit e9502c9 · 1 Parent(s): e1d661e
Files changed (6)
  1. .gitignore +2 -0
  2. Dockerfile +7 -10
  3. app.py +13 -0
  4. download_model.py +21 -0
  5. embedding.py +56 -0
  6. requirements.txt +10 -2
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .venv_hf_space
+ ai_models
Dockerfile CHANGED
@@ -1,16 +1,13 @@
  # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
  # you will also find guides on how best to write your Dockerfile

- FROM python:3.9
+ FROM docker.elastic.co/elasticsearch/elasticsearch:9.0.0

- RUN useradd -m -u 1000 user
- USER user
- ENV PATH="/home/user/.local/bin:$PATH"
+ ENV discovery.type=single-node
+ ENV xpack.security.enabled=false
+ ENV ES_JAVA_OPTS="-Xms512m -Xmx512m"

- WORKDIR /app
+ EXPOSE 9200 9300

- COPY --chown=user ./requirements.txt requirements.txt
- RUN pip install --no-cache-dir --upgrade -r requirements.txt
-
- COPY --chown=user . /app
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ HEALTHCHECK --interval=30s --timeout=30s --retries=3 \
+     CMD curl -f http://localhost:9200/_cluster/health || exit 1
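Note: the HEALTHCHECK above probes the cluster health API with curl inside the container. The same readiness check can be done from Python with the elasticsearch client already listed in requirements.txt; this is a minimal sketch, assuming the single-node cluster from this Dockerfile is reachable at http://localhost:9200 with security disabled, and the retry and interval values simply mirror the HEALTHCHECK settings.

# Readiness probe mirroring the Dockerfile HEALTHCHECK (sketch, not part of the commit).
# Assumes http://localhost:9200 with xpack security disabled, as configured above.
import time

from elasticsearch import Elasticsearch


def wait_for_elasticsearch(url="http://localhost:9200", retries=3, interval=30.0):
    """Return True once the cluster reports green/yellow health, False after `retries` failures."""
    es = Elasticsearch(url)
    for _ in range(retries):
        try:
            if es.cluster.health()["status"] in ("green", "yellow"):
                return True
        except Exception:
            pass  # cluster not reachable yet; retry after the interval
        time.sleep(interval)
    return False


if __name__ == "__main__":
    print("Elasticsearch ready:", wait_for_elasticsearch())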
app.py CHANGED
@@ -1,7 +1,20 @@
  from fastapi import FastAPI
+ from embedding import PDFEmbedding

  app = FastAPI()

  @app.get("/")
  def greet_json():
      return {"Hello": "World!"}
+
+
+ # Moved to an API endpoint
+ # See the following code in app.py:
+ @app.post("/process")
+ def process_pdfs():
+     pdf_embedding = PDFEmbedding(
+         model_path="../ai_models/hf/BGE-m3-ko",
+         pdf_dir="./data/pdf"
+     )
+     pdf_embedding.process_and_store()
+     return {"status": "success"}
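Note: once the Space is running, the new POST /process endpoint can be smoke-tested with a plain HTTP request. A minimal sketch using only the standard library; port 7860 is an assumption carried over from the uvicorn CMD in the previous Dockerfile, and the call expects Elasticsearch and the local BGE-m3-ko model to be available.

# Smoke test for POST /process (sketch, not part of the commit).
# Assumes the app is served on port 7860 and its dependencies are up.
import json
import urllib.request

req = urllib.request.Request("http://localhost:7860/process", method="POST")
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # expected: {'status': 'success'}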
download_model.py ADDED
@@ -0,0 +1,21 @@
+ import os
+ from huggingface_hub import snapshot_download
+
+ def download_bge_model():
+     # Set the local path where the model is stored
+     model_path = "./ai_models/hf/BGE-m3-ko"
+
+     # Create the directory if it does not exist
+     os.makedirs(model_path, exist_ok=True)
+
+     # Download the model snapshot
+     snapshot_download(
+         repo_id="dragonkue/BGE-m3-ko",
+         local_dir=model_path,
+         revision="main"
+     )
+
+     print(f"Model downloaded to {model_path}.")
+
+ if __name__ == "__main__":
+     download_bge_model()
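Note: after running python download_model.py, the local snapshot can be sanity-checked before embedding.py points at it. A minimal sketch, assuming sentence-transformers from requirements.txt and the ./ai_models/hf/BGE-m3-ko path used above.

# Sanity check for the downloaded snapshot (sketch, not part of the commit).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("./ai_models/hf/BGE-m3-ko")  # path written by download_bge_model()
vector = model.encode("embedding smoke test", normalize_embeddings=True)
print(vector.shape)  # BGE-m3 dense embeddings are 1024-dimensional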
embedding.py ADDED
@@ -0,0 +1,56 @@
+ from langchain.embeddings import HuggingFaceEmbeddings
+ from langchain_community.document_loaders import PyPDFDirectoryLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.vectorstores import ElasticsearchStore
+
+ class PDFEmbedding:
+     def __init__(self, model_path="dragonkue/BGE-m3-ko", pdf_dir="./data/pdf", es_url="http://localhost:9200", index_name="pdf_embeddings"):
+         self.embeddings = HuggingFaceEmbeddings(
+             model_name=model_path,
+             model_kwargs={'device': 'cuda:0'},
+             encode_kwargs={'normalize_embeddings': True}
+         )
+         self.pdf_dir = pdf_dir
+         self.es_url = es_url
+         self.index_name = index_name
+
+     def load_pdf_directory(self):
+         loader = PyPDFDirectoryLoader(self.pdf_dir)
+         pages = loader.load()
+
+         # Clean up line-break noise from PDF extraction
+         for page in pages:
+             # Rejoin words hyphenated across line breaks
+             page.page_content = page.page_content.replace("-\n", "")
+             # Convert remaining line breaks to spaces
+             page.page_content = page.page_content.replace("\n", " ")
+
+         return pages
+
+     def split_documents(self, documents):
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=400,
+             chunk_overlap=50,
+             length_function=len,
+             separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" "],
+             is_separator_regex=True
+         )
+         return text_splitter.split_documents(documents)
+
+     def process_and_store(self):
+         # Load the PDFs
+         pdf_data = self.load_pdf_directory()
+
+         # Split the documents into chunks
+         chunks = self.split_documents(pdf_data)
+
+         # Create the Elasticsearch vector store
+         vectorstore = ElasticsearchStore(
+             es_url=self.es_url,
+             index_name=self.index_name,
+             embedding=self.embeddings
+         )
+
+         # Store the documents
+         vectorstore.add_documents(chunks)
+
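Note: once process_and_store() has populated the index, the same store can be queried back for retrieval. A minimal sketch: the URL and index name mirror the class defaults above, the CPU device override is an assumption for machines without CUDA, and the query string is only illustrative.

# Query the index written by PDFEmbedding.process_and_store() (sketch, not part of the commit).
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import ElasticsearchStore

embeddings = HuggingFaceEmbeddings(
    model_name="dragonkue/BGE-m3-ko",
    model_kwargs={"device": "cpu"},  # assumption: no GPU available
    encode_kwargs={"normalize_embeddings": True},
)
store = ElasticsearchStore(
    es_url="http://localhost:9200",
    index_name="pdf_embeddings",
    embedding=embeddings,
)
for doc in store.similarity_search("example query", k=3):
    print(doc.metadata.get("source"), doc.page_content[:80])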
requirements.txt CHANGED
@@ -1,2 +1,10 @@
- fastapi
- uvicorn[standard]
+ huggingface-hub>=0.19.0
+ fastapi>=0.68.0
+ uvicorn>=0.15.0
+ langchain>=0.1.0
+ langchain-community>=0.0.10
+ elasticsearch>=8.0.0
+ pypdf>=3.0.0
+ torch>=2.0.0
+ transformers>=4.30.0
+ sentence-transformers>=2.2.0