Abdullah Meda committed on
Commit
f126864
·
1 Parent(s): 581dbdc

v1 complete

Browse files
Files changed (6) hide show
  1. .gitignore +2 -1
  2. app.py +58 -3
  3. make_docs.py +2 -1
  4. make_rag_db.py +62 -0
  5. postBuild +2 -0
  6. requirements.txt +5 -0
.gitignore CHANGED
@@ -171,4 +171,5 @@ tags
171
  # local
172
  *.ipynb
173
  docs/
174
- repos/
 
 
171
  # local
172
  *.ipynb
173
  docs/
174
+ repos/
175
+ milvus.db
app.py CHANGED
@@ -3,8 +3,28 @@ import os
3
  import json
4
  import subprocess
5
  import tempfile
 
6
  import shutil
7
  from pathlib import Path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  def list_huggingface_resources_names() -> list[str]:
10
  """List all the names of the libraries, services, and other resources available within the HuggingFace ecosystem.
@@ -15,9 +35,38 @@ def list_huggingface_resources_names() -> list[str]:
15
  with open('repos_config.json', 'r') as f:
16
  repos = json.load(f)
17
 
 
 
18
  return [repo['title'] for repo in repos]
19
 
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  list_resources_demo = gr.Interface(
22
  fn=list_huggingface_resources_names,
23
  inputs=[],
@@ -26,11 +75,17 @@ list_resources_demo = gr.Interface(
26
  description="Explore the names of the libraries, services, and other resources available within the HuggingFace ecosystem"
27
  )
28
 
 
 
 
 
 
 
29
  # Create tabbed interface
30
  demo = gr.TabbedInterface(
31
- [list_resources_demo],
32
- ["List Resources"],
33
  title="HuggingFace Ecosystem Documentation Explorer",
34
- )
35
 
36
  demo.launch(mcp_server=True)
 
3
  import json
4
  import subprocess
5
  import tempfile
6
+ import dotenv
7
  import shutil
8
  from pathlib import Path
9
+ from string import Template
10
+ from pymilvus import MilvusClient, model
11
+
12
+ assert dotenv.load_dotenv()
13
+
14
+ template = Template("""\
15
+ ---
16
+ File: $file_path
17
+ ---
18
+
19
+ $file_content""")
20
+
21
+ client = MilvusClient("milvus.db")
22
+ embedding_fn = model.dense.OpenAIEmbeddingFunction(
23
+ model_name='text-embedding-3-small', # Specify the model name
24
+ api_key=os.environ.get('OPENAI_API_KEY'), # Provide your OpenAI API key
25
+ dimensions=1536 # Set the embedding dimensionality
26
+ )
27
+
28
 
29
  def list_huggingface_resources_names() -> list[str]:
30
  """List all the names of the libraries, services, and other resources available within the HuggingFace ecosystem.
 
35
  with open('repos_config.json', 'r') as f:
36
  repos = json.load(f)
37
 
38
+ print([repo['title'] for repo in repos])
39
+
40
  return [repo['title'] for repo in repos]
41
 
42
 
43
+ def get_huggingface_documentation(topic: str, resource_names: list[str]) -> str:
44
+ """Get the documentation for the given topic and resource names.
45
+
46
+ Args:
47
+ topic: Focus the docs on a specific topic (e.g. "Anthropic Provider Chat UI", "LoRA methods PEFT" or "TGI on Intel GPUs")
48
+ resource_names: A list of relevant resource names to the topic
49
+
50
+ Returns:
51
+ A string of documentation for the given topic and resource names
52
+ """
53
+ print(resource_names)
54
+ query_vectors = embedding_fn.encode_queries([topic])
55
+ res = client.search(collection_name="hf_docs", data=query_vectors, limit=3, output_fields=["text", "file_path"])
56
+ print(res)
57
+
58
+ docs_paths = [res[0][i]['file_path'] for i in range(len(res[0]))]
59
+ print(docs_paths)
60
+
61
+ documentation = ""
62
+ for path in docs_paths:
63
+ with open(path, 'r') as f:
64
+ content = f.read()
65
+ documentation += template.substitute(file_path=path.replace('docs/', ''), file_content=content) + "\n\n"
66
+
67
+ print(documentation.strip())
68
+ return documentation.strip()
69
+
70
  list_resources_demo = gr.Interface(
71
  fn=list_huggingface_resources_names,
72
  inputs=[],
 
75
  description="Explore the names of the libraries, services, and other resources available within the HuggingFace ecosystem"
76
  )
77
 
78
+ get_docs_demo = gr.Interface(
79
+ fn=get_huggingface_documentation,
80
+ inputs=["text", "json"],
81
+ outputs="text",
82
+ )
83
+
84
  # Create tabbed interface
85
  demo = gr.TabbedInterface(
86
+ [list_resources_demo, get_docs_demo],
87
+ ["List Resources", "Get Documentation"],
88
  title="HuggingFace Ecosystem Documentation Explorer",
89
+ )
90
 
91
  demo.launch(mcp_server=True)
make_docs.py CHANGED
@@ -93,7 +93,7 @@ def save_section_to_disk(section: Dict, file_path: Path, raw_docs_path: Path):
93
  shutil.copy(local_path, file_path / f"{title}{local_path.suffix}")
94
 
95
  except Exception as e:
96
- # TODO: Handle symlinks, missing files, and other edge cases
97
  pass
98
 
99
 
@@ -128,4 +128,5 @@ if __name__ == "__main__":
128
  with open("repos_config.json", "r") as f:
129
  repos = json.load(f)
130
 
 
131
  make_docs(repos, args)
 
93
  shutil.copy(local_path, file_path / f"{title}{local_path.suffix}")
94
 
95
  except Exception as e:
96
+ # TODO: Not many cases, but handle symlinks, missing files, and other edge cases
97
  pass
98
 
99
 
 
128
  with open("repos_config.json", "r") as f:
129
  repos = json.load(f)
130
 
131
+ shutil.rmtree(args.docs_dir)
132
  make_docs(repos, args)
make_rag_db.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from typing import Dict
4
+ import dotenv
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ from pymilvus import MilvusClient, model
8
+
9
+ assert dotenv.load_dotenv()
10
+
11
+
12
+ def create_collection(client: MilvusClient, collection_name: str, dimension: int):
13
+ if client.has_collection(collection_name=collection_name):
14
+ client.drop_collection(collection_name=collection_name)
15
+
16
+ client.create_collection(
17
+ collection_name=collection_name,
18
+ dimension=dimension,
19
+ )
20
+
21
+ def main(args: Dict):
22
+ client = MilvusClient("milvus.db")
23
+
24
+ embedding_fn = model.dense.OpenAIEmbeddingFunction(
25
+ model_name=args.model_name,
26
+ api_key=os.environ.get('OPENAI_API_KEY'),
27
+ dimensions=args.dimension
28
+ )
29
+
30
+ create_collection(client, args.collection_name, args.dimension)
31
+
32
+ docs = Path(args.docs_dir)
33
+ md_file_paths = list(docs.rglob('*.md'))
34
+ mdx_file_paths = list(docs.rglob('*.mdx'))
35
+ all_file_paths = md_file_paths + mdx_file_paths
36
+
37
+ docs, payloads = [], []
38
+ for file in tqdm(all_file_paths):
39
+ embed_string = str(file).replace('docs/', '').replace('.mdx', '').replace('.md', '').replace('/', ' ')
40
+
41
+ docs.append(embed_string)
42
+ payloads.append({'file_path': str(file)})
43
+
44
+ vectors = embedding_fn.encode_documents(docs)
45
+
46
+ data = [
47
+ {"id": i, "vector": vectors[i], "text": docs[i], **payloads[i]}
48
+ for i in range(len(vectors))
49
+ ]
50
+
51
+ response = client.insert(collection_name=args.collection_name, data=data)
52
+ print(f"Inserted {response['insert_count']} vectors into collection {args.collection_name}")
53
+
54
+ if __name__ == "__main__":
55
+ parser = argparse.ArgumentParser()
56
+ parser.add_argument("--collection_name", type=str, default="hf_docs")
57
+ parser.add_argument("--model_name", type=str, default="text-embedding-3-small")
58
+ parser.add_argument("--dimension", type=int, default=1536)
59
+ parser.add_argument("--docs_dir", type=str, default="docs")
60
+ args = parser.parse_args()
61
+
62
+ main(args)
postBuild ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ python3 make_docs.py
2
+ python3 make_rag_db.py
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ pymilvus==2.5.10
2
+ pymilvus_model==0.3.2
3
+ python-dotenv==1.1.0
4
+ PyYAML==6.0.2
5
+ tqdm==4.65.0