test
Browse files
- .gitignore +2 -0
- Dockerfile +7 -10
- app.py +13 -0
- download_model.py +21 -0
- embedding.py +56 -0
- requirements.txt +10 -2
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.venv_hf_space
+ai_models
Dockerfile
CHANGED
@@ -1,16 +1,13 @@
 # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile

-FROM …
+FROM docker.elastic.co/elasticsearch/elasticsearch:9.0.0

-…
-…
-ENV …
+ENV discovery.type=single-node
+ENV xpack.security.enabled=false
+ENV ES_JAVA_OPTS="-Xms512m -Xmx512m"

-…
+EXPOSE 9200 9300

-…
-…
-…
-COPY --chown=user . /app
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+HEALTHCHECK --interval=30s --timeout=30s --retries=3 \
+    CMD curl -f http://localhost:9200/_cluster/health || exit 1
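Note (not part of the commit): the new Dockerfile turns the Space into a single-node Elasticsearch 9.0.0 container with security disabled and a 512 MB heap, and the HEALTHCHECK polls the cluster health endpoint with curl. A minimal sketch of the same check from Python, using the elasticsearch client already listed in requirements.txt and assuming the container is reachable at http://localhost:9200:

from elasticsearch import Elasticsearch

# Assumes the container built from this Dockerfile is running locally on port 9200
# with xpack.security.enabled=false, so no credentials are passed.
es = Elasticsearch("http://localhost:9200")

health = es.cluster.health()
print(health["status"])  # a single-node cluster typically reports "green" or "yellow"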
app.py
CHANGED
@@ -1,7 +1,20 @@
 from fastapi import FastAPI
+from embedding import PDFEmbedding

 app = FastAPI()

 @app.get("/")
 def greet_json():
     return {"Hello": "World!"}
+
+
+# Moved here as an API endpoint
+# See the following code in app.py:
+@app.post("/process")
+def process_pdfs():
+    pdf_embedding = PDFEmbedding(
+        model_path="../ai_models/hf/BGE-m3-ko",
+        pdf_dir="./data/pdf"
+    )
+    pdf_embedding.process_and_store()
+    return {"status": "success"}
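Note (not part of the commit): a minimal sketch of calling the new /process endpoint once the app is running, using only the standard library; the host and port are assumptions (port 7860 comes from the uvicorn CMD in the previous Dockerfile):

import json
import urllib.request

# POST to the /process endpoint defined above; adjust host/port to where
# the FastAPI app is actually served.
req = urllib.request.Request("http://localhost:7860/process", method="POST")
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read()))  # expected: {"status": "success"}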
download_model.py
ADDED
@@ -0,0 +1,21 @@
+import os
+from huggingface_hub import snapshot_download
+
+def download_bge_model():
+    # Set the local path where the model is stored
+    model_path = "./ai_models/hf/BGE-m3-ko"
+
+    # Create the directory if it does not exist
+    os.makedirs(model_path, exist_ok=True)
+
+    # Download the model
+    snapshot_download(
+        repo_id="dragonkue/BGE-m3-ko",
+        local_dir=model_path,
+        revision="main"
+    )
+
+    print(f"Model downloaded to {model_path}.")
+
+if __name__ == "__main__":
+    download_bge_model()
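Note (not part of the commit): one way to sanity-check the downloaded snapshot is to load it with sentence-transformers (already listed in requirements.txt) and embed a test sentence; the path matches the one used in download_bge_model(), everything else here is illustrative:

from sentence_transformers import SentenceTransformer

# Load the locally downloaded BGE-m3-ko snapshot and embed a sample sentence.
model = SentenceTransformer("./ai_models/hf/BGE-m3-ko")
vector = model.encode("a short sentence for a quick check", normalize_embeddings=True)
print(vector.shape)  # BGE-M3-based models typically produce 1024-dimensional embeddings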
embedding.py
ADDED
@@ -0,0 +1,56 @@
+from langchain.embeddings import HuggingFaceEmbeddings
+from langchain_community.document_loaders import PyPDFDirectoryLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import ElasticsearchStore
+
+class PDFEmbedding:
+    def __init__(self, model_path="dragonkue/BGE-m3-ko", pdf_dir="./data/pdf", es_url="http://localhost:9200", index_name="pdf_embeddings"):
+        self.embeddings = HuggingFaceEmbeddings(
+            model_name=model_path,
+            model_kwargs={'device': 'cuda:0'},
+            encode_kwargs={'normalize_embeddings': True}
+        )
+        self.pdf_dir = pdf_dir
+        self.es_url = es_url
+        self.index_name = index_name
+
+    def load_pdf_directory(self):
+        loader = PyPDFDirectoryLoader(self.pdf_dir)
+        pages = loader.load()
+
+        # Clean up line-break noise
+        for page in pages:
+            # Rejoin words that were hyphenated across line breaks
+            page.page_content = page.page_content.replace("-\n", "")
+            # Convert remaining line breaks to spaces
+            page.page_content = page.page_content.replace("\n", " ")
+
+        return pages
+
+    def split_documents(self, documents):
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=400,
+            chunk_overlap=50,
+            length_function=len,
+            separators=[r"\n{2,}", r"\n", r"[.!?]", r"[,;:]", r" "],
+            is_separator_regex=True
+        )
+        return text_splitter.split_documents(documents)
+
+    def process_and_store(self):
+        # Load the PDFs
+        pdf_data = self.load_pdf_directory()
+
+        # Split the documents into chunks
+        chunks = self.split_documents(pdf_data)
+
+        # Create the Elasticsearch vector store
+        vectorstore = ElasticsearchStore(
+            es_url=self.es_url,
+            index_name=self.index_name,
+            embedding=self.embeddings
+        )
+
+        # Store the documents
+        vectorstore.add_documents(chunks)
+
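Note (not part of the commit): once process_and_store() has populated the index, the same embeddings and index can be queried back. A minimal sketch, assuming the defaults above (index pdf_embeddings, Elasticsearch at http://localhost:9200), a local copy of the model as downloaded by download_model.py, and a CUDA device as required by the model_kwargs in PDFEmbedding:

from embedding import PDFEmbedding
from langchain_community.vectorstores import ElasticsearchStore

# Reuse the same embedding model and index that process_and_store() wrote to.
pdf_embedding = PDFEmbedding(model_path="./ai_models/hf/BGE-m3-ko", pdf_dir="./data/pdf")
store = ElasticsearchStore(
    es_url="http://localhost:9200",
    index_name="pdf_embeddings",
    embedding=pdf_embedding.embeddings,
)

# Return the three chunks closest to the query text.
for doc in store.similarity_search("what do the PDFs say about X?", k=3):
    print(doc.page_content[:100])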
requirements.txt
CHANGED
@@ -1,2 +1,10 @@
-…
-…
+huggingface-hub>=0.19.0
+fastapi>=0.68.0
+uvicorn>=0.15.0
+langchain>=0.1.0
+langchain-community>=0.0.10
+elasticsearch>=8.0.0
+pypdf>=3.0.0
+torch>=2.0.0
+transformers>=4.30.0
+sentence-transformers>=2.2.0