Spaces:
Running
Running
Abdullah Meda
committed on
Commit
·
f126864
1
Parent(s):
581dbdc
v1 complete
Browse files- .gitignore +2 -1
- app.py +58 -3
- make_docs.py +2 -1
- make_rag_db.py +62 -0
- postBuild +2 -0
- requirements.txt +5 -0
.gitignore
CHANGED
@@ -171,4 +171,5 @@ tags
|
|
171 |
# local
|
172 |
*.ipynb
|
173 |
docs/
|
174 |
-
repos/
|
|
|
|
171 |
# local
|
172 |
*.ipynb
|
173 |
docs/
|
174 |
+
repos/
|
175 |
+
milvus.db
|
app.py
CHANGED
@@ -3,8 +3,28 @@ import os
|
|
3 |
import json
|
4 |
import subprocess
|
5 |
import tempfile
|
|
|
6 |
import shutil
|
7 |
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def list_huggingface_resources_names() -> list[str]:
|
10 |
"""List all the names of the libraries, services, and other resources available within the HuggingFace ecosystem.
|
@@ -15,9 +35,38 @@ def list_huggingface_resources_names() -> list[str]:
|
|
15 |
with open('repos_config.json', 'r') as f:
|
16 |
repos = json.load(f)
|
17 |
|
|
|
|
|
18 |
return [repo['title'] for repo in repos]
|
19 |
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
list_resources_demo = gr.Interface(
|
22 |
fn=list_huggingface_resources_names,
|
23 |
inputs=[],
|
@@ -26,11 +75,17 @@ list_resources_demo = gr.Interface(
|
|
26 |
description="Explore the names of the libraries, services, and other resources available within the HuggingFace ecosystem"
|
27 |
)
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
# Create tabbed interface
|
30 |
demo = gr.TabbedInterface(
|
31 |
-
[list_resources_demo],
|
32 |
-
["List Resources"],
|
33 |
title="HuggingFace Ecosystem Documentation Explorer",
|
34 |
-
)
|
35 |
|
36 |
demo.launch(mcp_server=True)
|
|
|
3 |
import json
|
4 |
import subprocess
|
5 |
import tempfile
|
6 |
+
import dotenv
|
7 |
import shutil
|
8 |
from pathlib import Path
|
9 |
+
from string import Template
|
10 |
+
from pymilvus import MilvusClient, model
|
11 |
+
|
12 |
+
# Load variables from a local .env file when one exists. load_dotenv() returns
# False when no file is found (e.g. when the key is injected directly as an
# environment variable), so its result must not be asserted on; an assert
# would also be stripped, together with the call itself, under `python -O`.
dotenv.load_dotenv()

# Wrapper placed around each retrieved document before it is returned.
template = Template("""\
---
File: $file_path
---

$file_content""")

# Vector store file; make_rag_db.py writes its collection to the same path.
client = MilvusClient("milvus.db")
embedding_fn = model.dense.OpenAIEmbeddingFunction(
    model_name='text-embedding-3-small', # Specify the model name
    api_key=os.environ.get('OPENAI_API_KEY'), # Provide your OpenAI API key
    dimensions=1536 # Set the embedding dimensionality
)
|
27 |
+
|
28 |
|
29 |
def list_huggingface_resources_names() -> list[str]:
|
30 |
"""List all the names of the libraries, services, and other resources available within the HuggingFace ecosystem.
|
|
|
35 |
with open('repos_config.json', 'r') as f:
|
36 |
repos = json.load(f)
|
37 |
|
38 |
+
print([repo['title'] for repo in repos])
|
39 |
+
|
40 |
return [repo['title'] for repo in repos]
|
41 |
|
42 |
|
43 |
+
def get_huggingface_documentation(topic: str, resource_names: list[str]) -> str:
    """Get the documentation for the given topic and resource names.

    Args:
        topic: Focus the docs on a specific topic (e.g. "Anthropic Provider Chat UI", "LoRA methods PEFT" or "TGI on Intel GPUs")
        resource_names: A list of relevant resource names to the topic

    Returns:
        A string of documentation for the given topic and resource names
    """
    # NOTE(review): resource_names is only logged here; the vector search
    # below is not filtered by it.
    print(resource_names)
    query_vectors = embedding_fn.encode_queries([topic])
    res = client.search(collection_name="hf_docs", data=query_vectors, limit=3, output_fields=["text", "file_path"])
    print(res)

    # A single query vector is sent, so only the first result list is read.
    # Guard the index so an empty response yields an empty string instead of
    # an IndexError.
    hits = res[0] if res else []
    docs_paths = [hit['file_path'] for hit in hits]
    print(docs_paths)

    sections = []
    for path in docs_paths:
        # Explicit encoding so reads do not depend on the host locale.
        with open(path, 'r', encoding='utf-8') as f:
            content = f.read()
        # Strip only the leading "docs/" directory; str.replace would also
        # remove any later "docs/" segment inside the path.
        sections.append(template.substitute(file_path=path.removeprefix('docs/'), file_content=content))

    documentation = "\n\n".join(sections).strip()
    print(documentation)
    return documentation
|
69 |
+
|
70 |
list_resources_demo = gr.Interface(
|
71 |
fn=list_huggingface_resources_names,
|
72 |
inputs=[],
|
|
|
75 |
description="Explore the names of the libraries, services, and other resources available within the HuggingFace ecosystem"
|
76 |
)
|
77 |
|
78 |
+
get_docs_demo = gr.Interface(
|
79 |
+
fn=get_huggingface_documentation,
|
80 |
+
inputs=["text", "json"],
|
81 |
+
outputs="text",
|
82 |
+
)
|
83 |
+
|
84 |
# Create tabbed interface
|
85 |
demo = gr.TabbedInterface(
|
86 |
+
[list_resources_demo, get_docs_demo],
|
87 |
+
["List Resources", "Get Documentation"],
|
88 |
title="HuggingFace Ecosystem Documentation Explorer",
|
89 |
+
)
|
90 |
|
91 |
demo.launch(mcp_server=True)
|
make_docs.py
CHANGED
@@ -93,7 +93,7 @@ def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path):
|
|
93 |
shutil.copy(local_path, file_path / f"{title}{local_path.suffix}")
|
94 |
|
95 |
except Exception as e:
|
96 |
-
# TODO:
|
97 |
pass
|
98 |
|
99 |
|
@@ -128,4 +128,5 @@ if __name__ == "__main__":
|
|
128 |
with open("repos_config.json", "r") as f:
|
129 |
repos = json.load(f)
|
130 |
|
|
|
131 |
make_docs(repos, args)
|
|
|
93 |
shutil.copy(local_path, file_path / f"{title}{local_path.suffix}")
|
94 |
|
95 |
except Exception as e:
|
96 |
+
# TODO: Not many cases, but handle symlinks, missing files, and other edge cases
|
97 |
pass
|
98 |
|
99 |
|
|
|
128 |
with open("repos_config.json", "r") as f:
|
129 |
repos = json.load(f)
|
130 |
|
131 |
+
# Start from a clean output directory. ignore_errors=True keeps the first run
# (when the directory does not exist yet) from failing with FileNotFoundError.
shutil.rmtree(args.docs_dir, ignore_errors=True)
|
132 |
make_docs(repos, args)
|
make_rag_db.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import argparse
|
3 |
+
from typing import Dict
|
4 |
+
import dotenv
|
5 |
+
from pathlib import Path
|
6 |
+
from tqdm import tqdm
|
7 |
+
from pymilvus import MilvusClient, model
|
8 |
+
|
9 |
+
# Load a local .env file if present. load_dotenv() returns False when no file
# is found, so asserting on it would abort the build whenever the API key is
# supplied through the real environment instead of a .env file.
dotenv.load_dotenv()
|
10 |
+
|
11 |
+
|
12 |
+
def create_collection(client: MilvusClient, collection_name: str, dimension: int):
    """Create `collection_name` from scratch with the given vector dimension.

    If a collection with that name already exists it is dropped first, so the
    new collection never carries over earlier contents.

    Args:
        client: Milvus client used to inspect, drop and create the collection.
        collection_name: Name of the collection to (re)create.
        dimension: Dimensionality of the vectors the collection will hold.
    """
    already_exists = client.has_collection(collection_name=collection_name)
    if already_exists:
        client.drop_collection(collection_name=collection_name)

    client.create_collection(collection_name=collection_name, dimension=dimension)
|
20 |
+
|
21 |
+
def main(args: argparse.Namespace):
    """Embed the path of every Markdown doc and store it in a Milvus collection.

    Each `.md` / `.mdx` file under `args.docs_dir` is represented by its path
    relative to that directory, with the extension removed and the path
    components joined by spaces. That string is embedded and stored with the
    file's path so the file can be opened again at query time.

    Args:
        args: Parsed command-line arguments providing `collection_name`,
            `model_name`, `dimension` and `docs_dir`.

    Raises:
        FileNotFoundError: If no `.md` or `.mdx` file exists under `docs_dir`.
    """
    client = MilvusClient("milvus.db")

    embedding_fn = model.dense.OpenAIEmbeddingFunction(
        model_name=args.model_name,
        api_key=os.environ.get('OPENAI_API_KEY'),
        dimensions=args.dimension
    )

    create_collection(client, args.collection_name, args.dimension)

    docs_root = Path(args.docs_dir)
    all_file_paths = list(docs_root.rglob('*.md')) + list(docs_root.rglob('*.mdx'))
    if not all_file_paths:
        # Fail with a clear message rather than sending an empty embedding request.
        raise FileNotFoundError(f"No .md or .mdx files found under {docs_root}")

    # Build the text to embed from the path relative to the configured docs
    # directory, so a non-default --docs_dir works and only the real prefix
    # and the real file extension are removed.
    texts = [
        ' '.join(file.relative_to(docs_root).with_suffix('').parts)
        for file in all_file_paths
    ]

    # Embed in bounded batches instead of one request for the whole corpus.
    # NOTE(review): the embeddings endpoint presumably caps inputs per
    # request; confirm the limit and adjust batch_size if needed.
    batch_size = 512
    vectors = []
    for start in tqdm(range(0, len(texts), batch_size)):
        vectors.extend(embedding_fn.encode_documents(texts[start:start + batch_size]))

    data = [
        {"id": i, "vector": vector, "text": text, "file_path": str(file)}
        for i, (vector, text, file) in enumerate(zip(vectors, texts, all_file_paths))
    ]

    response = client.insert(collection_name=args.collection_name, data=data)
    print(f"Inserted {response['insert_count']} vectors into collection {args.collection_name}")
|
53 |
+
|
54 |
+
if __name__ == "__main__":
|
55 |
+
parser = argparse.ArgumentParser()
|
56 |
+
parser.add_argument("--collection_name", type=str, default="hf_docs")
|
57 |
+
parser.add_argument("--model_name", type=str, default="text-embedding-3-small")
|
58 |
+
parser.add_argument("--dimension", type=int, default=1536)
|
59 |
+
parser.add_argument("--docs_dir", type=str, default="docs")
|
60 |
+
args = parser.parse_args()
|
61 |
+
|
62 |
+
main(args)
|
postBuild
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
python3 make_docs.py
|
2 |
+
python3 make_rag_db.py
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pymilvus==2.5.10
|
2 |
+
pymilvus_model==0.3.2
|
3 |
+
python-dotenv==1.1.0
|
4 |
+
PyYAML==6.0.2
|
5 |
+
tqdm==4.65.0
|