hbertrand committed on
Commit
2642581
·
unverified ·
1 Parent(s): 177af2d

support for godot and lightning (#70)

Browse files
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.db filter=lfs diff=lfs merge=lfs -text
buster/data/{document_embeddings_huggingface.tar.gz → documents.db} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19453cb9ec85e644306af7dcc6fcad79cbb842d15a2087a66ddf48b5cbd9fbc9
3
- size 46918939
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86c2b4f5a2ec410c2b9132ed62213528ba10c0dc260162f689e30ba677815f1
3
+ size 244338688
buster/docparser.py CHANGED
@@ -36,6 +36,16 @@ supported_docs = {
36
  "filename": "documents_huggingface.csv",
37
  "parser": HuggingfaceParser,
38
  },
 
 
 
 
 
 
 
 
 
 
39
  }
40
 
41
 
 
36
  "filename": "documents_huggingface.csv",
37
  "parser": HuggingfaceParser,
38
  },
39
+ "lightning": {
40
+ "base_url": "https://pytorch-lightning.readthedocs.io/en/stable/",
41
+ "filename": "documents_lightning.csv",
42
+ "parser": SphinxParser,
43
+ },
44
+ "godot": {
45
+ "base_url": "https://docs.godotengine.org/en/stable/",
46
+ "filename": "documents_godot.csv",
47
+ "parser": SphinxParser,
48
+ },
49
  }
50
 
51
 
buster/documents/sqlite/documents.py CHANGED
@@ -1,7 +1,5 @@
1
  import itertools
2
  import sqlite3
3
- import warnings
4
- import zlib
5
  from pathlib import Path
6
  from typing import Iterable, NamedTuple
7
 
@@ -41,7 +39,7 @@ class DocumentsDB(DocumentsManager):
41
  def __init__(self, db_path: sqlite3.Connection | str):
42
  if isinstance(db_path, (str, Path)):
43
  self.db_path = db_path
44
- self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES)
45
  else:
46
  self.db_path = None
47
  self.conn = db_path
@@ -143,6 +141,7 @@ class DocumentsDB(DocumentsManager):
143
 
144
  sid, vid = self.add_parse(source, (section for section, _ in sections))
145
  self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
 
146
 
147
  def get_documents(self, source: str) -> pd.DataFrame:
148
  """Get all current documents from a given source."""
 
1
  import itertools
2
  import sqlite3
 
 
3
  from pathlib import Path
4
  from typing import Iterable, NamedTuple
5
 
 
39
  def __init__(self, db_path: sqlite3.Connection | str):
40
  if isinstance(db_path, (str, Path)):
41
  self.db_path = db_path
42
+ self.conn = sqlite3.connect(db_path, detect_types=sqlite3.PARSE_DECLTYPES, check_same_thread=False)
43
  else:
44
  self.db_path = None
45
  self.conn = db_path
 
141
 
142
  sid, vid = self.add_parse(source, (section for section, _ in sections))
143
  self.add_chunking(sid, vid, size, (chunks for _, chunks in sections))
144
+ self.conn.commit()
145
 
146
  def get_documents(self, source: str) -> pd.DataFrame:
147
  """Get all current documents from a given source."""
buster/notebooks/db_to_csv.ipynb CHANGED
@@ -10,32 +10,41 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": null,
14
  "metadata": {},
15
  "outputs": [],
16
  "source": [
17
  "import os\n",
18
  "\n",
19
- "from buster.docparser import read_documents, write_documents"
20
  ]
21
  },
22
  {
23
  "cell_type": "code",
24
- "execution_count": null,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
28
  "# Path to the database\n",
29
  "db_path = \"documents.db\"\n",
 
30
  "\n",
31
  "# Source to extract\n",
32
- "target = \"pytorch\"\n",
33
- "df = read_documents(db_path, target)\n",
34
  "\n",
35
  "# If you want to save it as tar.gz\n",
36
  "filepath = os.path.join('buster/data/', f'document_embeddings_{target}.tar.gz')\n",
37
- "write_documents(filepath, target, df)"
 
38
  ]
 
 
 
 
 
 
 
39
  }
40
  ],
41
  "metadata": {
@@ -45,7 +54,15 @@
45
  "name": "python3"
46
  },
47
  "language_info": {
 
 
 
 
 
 
48
  "name": "python",
 
 
49
  "version": "3.10.9"
50
  },
51
  "orig_nbformat": 4,
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": 1,
14
  "metadata": {},
15
  "outputs": [],
16
  "source": [
17
  "import os\n",
18
  "\n",
19
+ "from buster.documents import get_documents_manager_from_extension"
20
  ]
21
  },
22
  {
23
  "cell_type": "code",
24
+ "execution_count": 2,
25
  "metadata": {},
26
  "outputs": [],
27
  "source": [
28
  "# Path to the database\n",
29
  "db_path = \"documents.db\"\n",
30
+ "db = get_documents_manager_from_extension(db_path)(db_path)\n",
31
  "\n",
32
  "# Source to extract\n",
33
+ "target = \"lightning\"\n",
34
+ "df = db.get_documents(target)\n",
35
  "\n",
36
  "# If you want to save it as tar.gz\n",
37
  "filepath = os.path.join('buster/data/', f'document_embeddings_{target}.tar.gz')\n",
38
+ "db_df = get_documents_manager_from_extension(filepath)(filepath)\n",
39
+ "db_df.add(target, df)"
40
  ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": []
48
  }
49
  ],
50
  "metadata": {
 
54
  "name": "python3"
55
  },
56
  "language_info": {
57
+ "codemirror_mode": {
58
+ "name": "ipython",
59
+ "version": 3
60
+ },
61
+ "file_extension": ".py",
62
+ "mimetype": "text/x-python",
63
  "name": "python",
64
+ "nbconvert_exporter": "python",
65
+ "pygments_lexer": "ipython3",
66
  "version": "3.10.9"
67
  },
68
  "orig_nbformat": 4,