Spaces:
Running
Running
updated requirements
Browse files
- .idea/code-chunker.iml +1 -0
- Chunker.py +15 -15
- app.py +19 -1
- requirements.txt +45 -0
.idea/code-chunker.iml
CHANGED
|
@@ -10,5 +10,6 @@
|
|
| 10 |
</component>
|
| 11 |
<component name="PackageRequirementsSettings">
|
| 12 |
<option name="removeUnused" value="true" />
|
|
|
|
| 13 |
</component>
|
| 14 |
</module>
|
|
|
|
| 10 |
</component>
|
| 11 |
<component name="PackageRequirementsSettings">
|
| 12 |
<option name="removeUnused" value="true" />
|
| 13 |
+
<option name="modifyBaseFiles" value="true" />
|
| 14 |
</component>
|
| 15 |
</module>
|
Chunker.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
from abc import ABC, abstractmethod
|
| 2 |
from CodeParser import CodeParser
|
| 3 |
-
from
|
| 4 |
-
|
| 5 |
|
| 6 |
|
| 7 |
class Chunker(ABC):
|
|
@@ -20,19 +19,20 @@ class Chunker(ABC):
|
|
| 20 |
def print_chunks(chunks):
|
| 21 |
for chunk_number, chunk_code in chunks.items():
|
| 22 |
print(f"Chunk {chunk_number}:")
|
| 23 |
-
print("="*40)
|
| 24 |
print(chunk_code)
|
| 25 |
-
print("="*40)
|
| 26 |
|
| 27 |
@staticmethod
|
| 28 |
def consolidate_chunks_into_file(chunks):
|
| 29 |
return "\n".join(chunks.values())
|
| 30 |
-
|
| 31 |
@staticmethod
|
| 32 |
def count_lines(consolidated_chunks):
|
| 33 |
lines = consolidated_chunks.split("\n")
|
| 34 |
return len(lines)
|
| 35 |
|
|
|
|
| 36 |
class CodeChunker(Chunker):
|
| 37 |
def __init__(self, file_extension, encoding_name="gpt-4"):
|
| 38 |
super().__init__(encoding_name)
|
|
@@ -60,15 +60,16 @@ class CodeChunker(Chunker):
|
|
| 60 |
if highest_comment_line: # If a highest comment line exists, add it
|
| 61 |
adjusted_breakpoints.append(highest_comment_line)
|
| 62 |
else:
|
| 63 |
-
adjusted_breakpoints.append(
|
|
|
|
| 64 |
|
| 65 |
breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
|
| 66 |
-
|
| 67 |
while i < len(lines):
|
| 68 |
line = lines[i]
|
| 69 |
new_token_count = count_tokens(line, self.encoding_name)
|
| 70 |
if token_count + new_token_count > token_limit:
|
| 71 |
-
|
| 72 |
# Set the stop line to the last breakpoint before the current line
|
| 73 |
if i in breakpoints:
|
| 74 |
stop_line = i
|
|
@@ -79,20 +80,20 @@ class CodeChunker(Chunker):
|
|
| 79 |
if stop_line == start_line and i not in breakpoints:
|
| 80 |
token_count += new_token_count
|
| 81 |
i += 1
|
| 82 |
-
|
| 83 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
| 84 |
elif stop_line == start_line and i == stop_line:
|
| 85 |
token_count += new_token_count
|
| 86 |
i += 1
|
| 87 |
-
|
| 88 |
-
|
| 89 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
| 90 |
elif stop_line == start_line and i in breakpoints:
|
| 91 |
current_chunk = "\n".join(lines[start_line:stop_line])
|
| 92 |
if current_chunk.strip(): # If the current chunk is not just whitespace
|
| 93 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
| 94 |
chunk_number += 1
|
| 95 |
-
|
| 96 |
token_count = 0
|
| 97 |
start_line = i
|
| 98 |
i += 1
|
|
@@ -103,7 +104,7 @@ class CodeChunker(Chunker):
|
|
| 103 |
if current_chunk.strip():
|
| 104 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
| 105 |
chunk_number += 1
|
| 106 |
-
|
| 107 |
i = stop_line
|
| 108 |
token_count = 0
|
| 109 |
start_line = stop_line
|
|
@@ -116,9 +117,8 @@ class CodeChunker(Chunker):
|
|
| 116 |
current_chunk_code = "\n".join(lines[start_line:])
|
| 117 |
if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
|
| 118 |
chunks[chunk_number] = current_chunk_code # Using chunk_number as key
|
| 119 |
-
|
| 120 |
return chunks
|
| 121 |
|
| 122 |
def get_chunk(self, chunked_codebase, chunk_number):
|
| 123 |
return chunked_codebase[chunk_number]
|
| 124 |
-
|
|
|
|
| 1 |
from abc import ABC, abstractmethod
|
| 2 |
from CodeParser import CodeParser
|
| 3 |
+
from utils import count_tokens
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
class Chunker(ABC):
|
|
|
|
| 19 |
def print_chunks(chunks):
|
| 20 |
for chunk_number, chunk_code in chunks.items():
|
| 21 |
print(f"Chunk {chunk_number}:")
|
| 22 |
+
print("=" * 40)
|
| 23 |
print(chunk_code)
|
| 24 |
+
print("=" * 40)
|
| 25 |
|
| 26 |
@staticmethod
|
| 27 |
def consolidate_chunks_into_file(chunks):
|
| 28 |
return "\n".join(chunks.values())
|
| 29 |
+
|
| 30 |
@staticmethod
|
| 31 |
def count_lines(consolidated_chunks):
|
| 32 |
lines = consolidated_chunks.split("\n")
|
| 33 |
return len(lines)
|
| 34 |
|
| 35 |
+
|
| 36 |
class CodeChunker(Chunker):
|
| 37 |
def __init__(self, file_extension, encoding_name="gpt-4"):
|
| 38 |
super().__init__(encoding_name)
|
|
|
|
| 60 |
if highest_comment_line: # If a highest comment line exists, add it
|
| 61 |
adjusted_breakpoints.append(highest_comment_line)
|
| 62 |
else:
|
| 63 |
+
adjusted_breakpoints.append(
|
| 64 |
+
bp) # If no comments were found before the breakpoint, add the original breakpoint
|
| 65 |
|
| 66 |
breakpoints = sorted(set(adjusted_breakpoints)) # Ensure breakpoints are unique and sorted
|
| 67 |
+
|
| 68 |
while i < len(lines):
|
| 69 |
line = lines[i]
|
| 70 |
new_token_count = count_tokens(line, self.encoding_name)
|
| 71 |
if token_count + new_token_count > token_limit:
|
| 72 |
+
|
| 73 |
# Set the stop line to the last breakpoint before the current line
|
| 74 |
if i in breakpoints:
|
| 75 |
stop_line = i
|
|
|
|
| 80 |
if stop_line == start_line and i not in breakpoints:
|
| 81 |
token_count += new_token_count
|
| 82 |
i += 1
|
| 83 |
+
|
| 84 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
| 85 |
elif stop_line == start_line and i == stop_line:
|
| 86 |
token_count += new_token_count
|
| 87 |
i += 1
|
| 88 |
+
|
| 89 |
+
|
| 90 |
# If the stop line is the same as the start line and the current line is a breakpoint, it means we can create a chunk with just the current line
|
| 91 |
elif stop_line == start_line and i in breakpoints:
|
| 92 |
current_chunk = "\n".join(lines[start_line:stop_line])
|
| 93 |
if current_chunk.strip(): # If the current chunk is not just whitespace
|
| 94 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
| 95 |
chunk_number += 1
|
| 96 |
+
|
| 97 |
token_count = 0
|
| 98 |
start_line = i
|
| 99 |
i += 1
|
|
|
|
| 104 |
if current_chunk.strip():
|
| 105 |
chunks[chunk_number] = current_chunk # Using chunk_number as key
|
| 106 |
chunk_number += 1
|
| 107 |
+
|
| 108 |
i = stop_line
|
| 109 |
token_count = 0
|
| 110 |
start_line = stop_line
|
|
|
|
| 117 |
current_chunk_code = "\n".join(lines[start_line:])
|
| 118 |
if current_chunk_code.strip(): # Checks if the chunk is not just whitespace
|
| 119 |
chunks[chunk_number] = current_chunk_code # Using chunk_number as key
|
| 120 |
+
|
| 121 |
return chunks
|
| 122 |
|
| 123 |
def get_chunk(self, chunked_codebase, chunk_number):
|
| 124 |
return chunked_codebase[chunk_number]
|
|
|
app.py
CHANGED
|
@@ -1,10 +1,28 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from utils import load_json, count_tokens
|
| 3 |
import json
|
|
|
|
| 4 |
|
| 5 |
# Set up the Streamlit page configuration
|
| 6 |
st.set_page_config(page_title="Cintra Code Chunker", layout="wide")
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
def main():
|
| 9 |
# Streamlit widgets for file selection
|
| 10 |
st.title("Cintra Code Chunker")
|
|
@@ -38,4 +56,4 @@ def main():
|
|
| 38 |
|
| 39 |
|
| 40 |
if __name__ == "__main__":
|
| 41 |
-
main()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from utils import load_json, count_tokens
|
| 3 |
import json
|
| 4 |
+
import os
|
| 5 |
|
| 6 |
# Set up the Streamlit page configuration
|
| 7 |
st.set_page_config(page_title="Cintra Code Chunker", layout="wide")
|
| 8 |
|
| 9 |
+
# Slider to select a value
|
| 10 |
+
x = st.slider("Select a value")
|
| 11 |
+
st.write(x, "squared is", x * x)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
code_files_directory = "example_code_files"
|
| 15 |
+
code_files = os.listdir(code_files_directory)
|
| 16 |
+
|
| 17 |
+
# Dropdown menu for the user to select a code file
|
| 18 |
+
selected_file = st.selectbox("Select a code file", code_files)
|
| 19 |
+
|
| 20 |
+
file_path = os.path.join(code_files_directory, selected_file)
|
| 21 |
+
with open(file_path, "r") as file:
|
| 22 |
+
code_content = file.read()
|
| 23 |
+
st.code(code_content, language="python")
|
| 24 |
+
|
| 25 |
+
|
| 26 |
def main():
|
| 27 |
# Streamlit widgets for file selection
|
| 28 |
st.title("Cintra Code Chunker")
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
if __name__ == "__main__":
|
| 59 |
+
main()
|
requirements.txt
CHANGED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
altair==5.3.0
|
| 2 |
+
attrs==23.2.0
|
| 3 |
+
blinker==1.7.0
|
| 4 |
+
cachetools==5.3.3
|
| 5 |
+
certifi==2024.2.2
|
| 6 |
+
charset-normalizer==3.3.2
|
| 7 |
+
click==8.1.7
|
| 8 |
+
colorama==0.4.6
|
| 9 |
+
gitdb==4.0.11
|
| 10 |
+
GitPython==3.1.43
|
| 11 |
+
idna==3.6
|
| 12 |
+
Jinja2==3.1.3
|
| 13 |
+
jsonschema==4.21.1
|
| 14 |
+
jsonschema-specifications==2023.12.1
|
| 15 |
+
markdown-it-py==3.0.0
|
| 16 |
+
MarkupSafe==2.1.5
|
| 17 |
+
mdurl==0.1.2
|
| 18 |
+
numpy==1.26.4
|
| 19 |
+
packaging==24.0
|
| 20 |
+
pandas==2.2.1
|
| 21 |
+
pillow==10.3.0
|
| 22 |
+
protobuf==4.25.3
|
| 23 |
+
pyarrow==15.0.2
|
| 24 |
+
pydeck==0.8.1b0
|
| 25 |
+
Pygments==2.17.2
|
| 26 |
+
python-dateutil==2.9.0.post0
|
| 27 |
+
pytz==2024.1
|
| 28 |
+
referencing==0.34.0
|
| 29 |
+
requests==2.31.0
|
| 30 |
+
rich==13.7.1
|
| 31 |
+
rpds-py==0.18.0
|
| 32 |
+
six==1.16.0
|
| 33 |
+
smmap==5.0.1
|
| 34 |
+
streamlit==1.33.0
|
| 35 |
+
tenacity==8.2.3
|
| 36 |
+
regex==2023.12.25
|
| 37 |
+
tiktoken==0.6.0
|
| 38 |
+
tree-sitter==0.21.3
|
| 39 |
+
toml==0.10.2
|
| 40 |
+
toolz==0.12.1
|
| 41 |
+
tornado==6.4
|
| 42 |
+
typing_extensions==4.11.0
|
| 43 |
+
tzdata==2024.1
|
| 44 |
+
urllib3==2.2.1
|
| 45 |
+
watchdog==4.0.0
|