blaise-tk committed on
Commit
6f74dd4
·
1 Parent(s): 2acca5f
Files changed (10) hide show
  1. .gitignore +5 -0
  2. LICENSE +7 -0
  3. README.md +54 -4
  4. app.py +65 -0
  5. main.py +131 -0
  6. repo2txt/__init__.py +0 -0
  7. repo2txt/cli.py +45 -0
  8. repo2txt/decoder.py +130 -0
  9. requirements.txt +1 -0
  10. setup.py +47 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ *.pyd
2
+
3
+ build/
4
+ dist/
5
+ *.egg-info/
LICENSE ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Copyright 2024 blaisewf
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4
+
5
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6
+
7
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md CHANGED
@@ -1,13 +1,63 @@
1
  ---
2
- title: Repo2txt
3
- emoji: 📚
4
  colorFrom: green
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.39.0
8
  app_file: app.py
9
- pinned: false
 
10
  license: mit
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: repo2txt
3
+ emoji: 🌐
4
  colorFrom: green
5
  colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.39.0
8
  app_file: app.py
9
+ pinned: true
10
+ header: mini
11
  license: mit
12
  ---
13
 
14
+ # repo2txt
15
+
16
+ `repo2txt` is a Python package that clones a GitHub repository, generates a text file containing the repository's directory structure and the contents of all its files, and handles cleanup.
17
+
18
+ ## Installation
19
+
20
+ You can install `repo2txt` using pip:
21
+
22
+ ```sh
23
+ pip install git+https://github.com/blaisewf/repo2txt.git
24
+ ```
25
+
26
+ Alternatively, you can clone the repository and install it locally:
27
+
28
+ ```sh
29
+ git clone https://github.com/blaisewf/repo2txt.git
30
+ cd repo2txt
31
+ pip install .
32
+ ```
33
+
34
+ > [!WARNING]
35
+ > Git is required to clone the repository. If you don't have Git installed, you can download it from [git-scm.com](https://git-scm.com/).
36
+
37
+ ## Usage
38
+
39
+ Once installed, you can use the CLI command `repo2txt` to process a GitHub repository. Here’s the basic syntax:
40
+
41
+ ```sh
42
+ repo2txt --repo-url <repository_url> --output-file <output_file_path>
43
+ ```
44
+
45
+ ### Example
46
+
47
+ ```sh
48
+ repo2txt --repo-url https://github.com/example/repository.git --output-file output.txt
49
+ ```
50
+
51
+ This command will:
52
+
53
+ 1. Clone the repository from `https://github.com/example/repository.git`.
54
+ 2. Generate a text file `output.txt` containing the directory structure and contents of all files in the repository.
55
+ 3. Clean up the cloned repository directory.
56
+
57
+ ## License
58
+
59
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
60
+
61
+ ## References
62
+ - https://github.com/kirill-markin/repo-to-text
63
+
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from repo2txt.decoder import (
3
+ clone_repo,
4
+ get_directory_structure,
5
+ extract_all_files_contents,
6
+ write_output_file,
7
+ cleanup,
8
+ )
9
+
10
+
11
def process_repository(repo_url_or_shorthand):
    """Process the GitHub repository and return the content of the output file.

    Args:
        repo_url_or_shorthand: Repository reference passed straight to
            ``clone_repo``.

    Returns:
        A ``(text, path)`` tuple. On success ``text`` is the full content of
        the generated file and ``path`` is the file's path. On any failure
        ``text`` is an error message and ``path`` is ``None``.
    """
    # Define the directory to clone into
    clone_dir = "temp_repo"
    output_file = "output.txt"

    try:
        # Clone the repository
        clone_repo(repo_url_or_shorthand, clone_dir)

        try:
            # Get directory structure and file contents
            directory_structure = get_directory_structure(clone_dir)
            file_contents = extract_all_files_contents(clone_dir)

            # Write output to file
            write_output_file(output_file, directory_structure, file_contents)

            # Read the content of the output file
            with open(output_file, "r", encoding="utf-8") as file:
                output_content = file.read()
        finally:
            # Always remove the clone once it exists; a leftover directory
            # would make every later clone into the same path fail.
            cleanup(clone_dir)

        return output_content, output_file

    except Exception as e:
        return f"An error occurred: {e}", None
39
+
40
+
41
# Define Gradio interface
# NOTE(review): the nesting below is reconstructed from the blank-line
# grouping of the original; confirm which widgets belong inside the Row.
with gr.Blocks(title="repo2txt") as demo:
    gr.Markdown("# repo2txt")

    with gr.Row():
        # Free-text input handed to process_repository as its only argument.
        repo_url_input = gr.Textbox(
            label="GitHub Repository URL or Shorthand",
            placeholder="e.g., user/repo or https://github.com/user/repo",
        )

    process_button = gr.Button("Process Repository")
    txt_output = gr.File(label="Download txt file")
    result_output = gr.Textbox(
        label="Result",
        lines=1,
        placeholder="Processing result will be shown here",
    )

    # process_repository returns (text, path): the text goes to the result
    # box and the path to the file download widget, in that order.
    process_button.click(
        process_repository, inputs=repo_url_input, outputs=[result_output, txt_output]
    )

# Launch the interface
if __name__ == "__main__":
    demo.launch()
main.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+ import argparse
5
+
6
+
7
def clone_repo(repo_url, clone_dir):
    """Run ``git clone`` for *repo_url*, placing the checkout in *clone_dir*.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status.
    """
    command = ["git", "clone", repo_url, clone_dir]
    subprocess.run(command, check=True)
10
+
11
+
12
def extract_repo_name_from_url(repo_url):
    """Extract the repository name from the GitHub URL.

    The name is the last path segment cut at its first dot, so
    ``https://github.com/user/repo.git`` gives ``repo``. That truncation is
    kept as-is for existing callers.

    Dot-prefixed names such as ``.github`` would truncate to an empty string,
    which is unusable as a directory name. For those, the whole segment is
    returned instead, minus a trailing ``.git`` if present.

    Args:
        repo_url: Repository URL or ``user/repo`` shorthand.

    Returns:
        The derived repository name (empty only if *repo_url* has no usable
        last segment).
    """
    last_segment = repo_url.rstrip("/").split("/")[-1]
    stem = last_segment.split(".")[0]
    if stem:
        return stem

    # Leading-dot name: fall back to the full segment without ".git".
    suffix = ".git"
    if last_segment.endswith(suffix) and len(last_segment) > len(suffix):
        return last_segment[: -len(suffix)]
    return last_segment
16
+
17
+
18
def get_directory_structure(root_dir):
    """Get the directory structure in a tree format, ignoring .git directory.

    Each directory is listed as ``├── name/`` and each file as ``├── name``,
    indented four spaces per nesting level below *root_dir*.

    Args:
        root_dir: Directory to walk. A trailing path separator is accepted.

    Returns:
        The tree as a single newline-joined string.
    """
    # Normalise first so a trailing separator cannot blank out the root's
    # basename or skew the depth calculation.
    root_dir = os.path.normpath(root_dir)

    lines = []
    for current, dirs, files in os.walk(root_dir):
        if ".git" in dirs:
            dirs.remove(".git")  # Avoid walking into .git directory
        # os.walk order is filesystem-dependent; sorting in place makes the
        # traversal (and therefore the output) deterministic.
        dirs.sort()

        # Depth comes from the path relative to the root, not from string
        # replacement, which miscounts when the root text recurs in a path.
        relative = os.path.relpath(current, root_dir)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = " " * 4 * level
        lines.append(f"{indent}├── {os.path.basename(current)}/")

        subindent = " " * 4 * (level + 1)
        lines.extend(f"{subindent}├── {name}" for name in sorted(files))
    return "\n".join(lines)
33
+
34
+
35
def read_file_contents(file_path):
    """Read the contents of a file, ignore if in .git directory.

    Only a path *component* named exactly ``.git`` counts as git metadata, so
    files such as ``.gitignore`` or anything under ``.github/`` are read
    normally.

    Args:
        file_path: Path of the file to read as UTF-8 text.

    Returns:
        The file's text, ``"[Ignored .git directory]"`` for git metadata, or
        an ``"[Error reading file: ...]"`` placeholder when the file cannot
        be opened or decoded.
    """
    # Compare whole components; a substring test would also match
    # ".gitignore", ".gitattributes" and ".github".
    components = os.path.normpath(file_path).split(os.sep)
    if ".git" in components:
        return "[Ignored .git directory]"

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (UnicodeDecodeError, OSError) as e:
        return f"[Error reading file: {e}]"
45
+
46
+
47
def extract_all_files_contents(root_dir):
    """Extract contents of all files in the directory, ignoring .git directory.

    Args:
        root_dir: Directory to walk.

    Returns:
        A dict mapping each file's path relative to *root_dir* to whatever
        ``read_file_contents`` returns for it.
    """
    file_contents = {}
    for root, dirs, files in os.walk(root_dir):
        # Prune the real .git directory so it is never descended into. A
        # substring test on the path would also drop ".github/" and any
        # directory whose name merely contains ".git".
        if ".git" in dirs:
            dirs.remove(".git")
        dirs.sort()  # deterministic traversal order

        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(file_path, root_dir)
            file_contents[relative_path] = read_file_contents(file_path)
    return file_contents
59
+
60
+
61
def count_tokens(text):
    """Return how many whitespace-separated tokens *text* contains."""
    tokens = text.split()
    return len(tokens)
64
+
65
+
66
def _count_lines(text):
    """Return the number of lines in *text*, counting an unterminated last line."""
    if not text:
        return 0
    return text.count("\n") + (0 if text.endswith("\n") else 1)


def write_output_file(output_file, directory_structure, file_contents):
    """Write the directory structure and file contents to the output file with metadata.

    The file starts with ``Lines`` and ``Characters`` totals covering the
    directory structure and all file contents, followed by the structure and
    then each file's content, each wrapped in a ``` fence.

    Args:
        output_file: Path of the UTF-8 text file to create or overwrite.
        directory_structure: Tree listing text.
        file_contents: Mapping of relative file path to file text.
    """
    # Count lines rather than newline characters, so text without a trailing
    # newline (the tree listing never has one) is not undercounted.
    total_lines = _count_lines(directory_structure) + sum(
        _count_lines(content) for content in file_contents.values()
    )
    total_chars = len(directory_structure) + sum(
        len(content) for content in file_contents.values()
    )

    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"Lines: {total_lines}\nCharacters: {total_chars}\n\n")
        file.write("Directory Structure:\n```\n")
        file.write(directory_structure)
        file.write("\n```\n")

        for file_path, content in file_contents.items():
            file.write(f"\nContents of {file_path}:\n```\n")
            file.write(content)
            file.write("\n```\n")
85
+
86
+
87
def cleanup(clone_dir):
    """Delete the directory *clone_dir* if it exists.

    Removal is best-effort: per-entry failures are routed to
    ``handle_remove_error`` and any exception that still escapes is printed
    rather than raised.
    """
    if not os.path.exists(clone_dir):
        return

    try:
        shutil.rmtree(clone_dir, onerror=handle_remove_error)
    except Exception as error:
        print(f"An error occurred while cleaning up: {error}")
94
+
95
+
96
def handle_remove_error(func, path, exc_info):
    """``shutil.rmtree`` error callback that retries after a permission failure.

    Args:
        func: The function that failed; called again with *path* after the
            permission bits are changed.
        path: The path *func* failed on.
        exc_info: Exception triple; only the exception instance at index 1
            is inspected.
    """
    import stat

    error = exc_info[1]
    if not isinstance(error, PermissionError):
        print(f"Error removing {path}: {error}")
        return

    # Make the entry writable, then retry the operation that failed.
    os.chmod(path, stat.S_IWRITE)
    func(path)
105
+
106
+
107
def main():
    """Clone a repository, dump its structure and contents to a text file, then clean up.

    Command-line arguments:
        repo_url: URL of the GitHub repository to process.
        output_file: Path to the output text file.
    """
    parser = argparse.ArgumentParser(
        description="Generate a text file with repository structure and all file contents."
    )
    parser.add_argument("repo_url", help="URL of the GitHub repository to process.")
    parser.add_argument("output_file", help="Path to the output text file.")
    args = parser.parse_args()

    repo_url = args.repo_url
    output_file = args.output_file

    repo_name = extract_repo_name_from_url(repo_url)
    clone_dir = repo_name

    # Clone outside the try block: if cloning fails because the directory
    # already exists, it is not ours and must not be deleted.
    clone_repo(repo_url, clone_dir)

    try:
        directory_structure = get_directory_structure(clone_dir)
        file_contents = extract_all_files_contents(clone_dir)

        write_output_file(output_file, directory_structure, file_contents)
    finally:
        # Remove the clone even when a step above raises.
        cleanup(clone_dir)


if __name__ == "__main__":
    main()
repo2txt/__init__.py ADDED
File without changes
repo2txt/cli.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import click
2
+ from repo2txt.decoder import (
3
+ clone_repo,
4
+ extract_repo_name_from_url,
5
+ get_directory_structure,
6
+ extract_all_files_contents,
7
+ write_output_file,
8
+ cleanup,
9
+ )
10
+
11
+
12
@click.command()
@click.option(
    "--repo-url",
    prompt="Repository URL",
    help="URL of the GitHub repository to process.",
)
@click.option(
    "--output-file", prompt="Output file path", help="Path to the output text file."
)
def cli(repo_url, output_file):
    """CLI entry point for generating a text file with repository structure and all file contents."""
    repo_name = extract_repo_name_from_url(repo_url)
    clone_dir = repo_name

    # Clone outside the try block: if cloning fails because the directory
    # already exists, it is not ours and must not be deleted.
    click.echo(f"Cloning repository {repo_url} into directory {clone_dir}...")
    clone_repo(repo_url, clone_dir)

    try:
        click.echo("Generating directory structure...")
        directory_structure = get_directory_structure(clone_dir)

        click.echo("Extracting file contents...")
        file_contents = extract_all_files_contents(clone_dir)

        click.echo(f"Writing output to {output_file}...")
        write_output_file(output_file, directory_structure, file_contents)
    finally:
        # Remove the clone even when a step above raises.
        click.echo("Cleaning up...")
        cleanup(clone_dir)

    click.echo("Process completed successfully.")


if __name__ == "__main__":
    cli()
repo2txt/decoder.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import subprocess
4
+
5
+
6
def is_git_installed():
    """Check if Git is installed.

    Returns:
        True when ``git --version`` runs and exits with status 0, otherwise
        False.
    """
    try:
        subprocess.run(
            ["git", "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError covers a missing or non-executable binary;
        # CalledProcessError covers a git that starts but exits non-zero
        # (raised because of check=True).
        return False
    return True
18
+
19
+
20
def convert_to_full_url(repo_url_or_shorthand):
    """Convert shorthand 'user/repo' format to full URL format.

    Surrounding whitespace is stripped first. Input that already looks like
    a remote URL (``http://``, ``https://``, ``ssh://``, ``git://`` or
    scp-style ``git@host:path``) is returned as-is; anything else is treated
    as a GitHub ``user/repo`` shorthand.

    Args:
        repo_url_or_shorthand: Full repository URL or ``user/repo``.

    Returns:
        A URL suitable for ``git clone``.
    """
    candidate = repo_url_or_shorthand.strip()

    # Match real URL prefixes, not just "http", so shorthands such as
    # "httpie/cli" are still expanded and SSH remotes are left alone.
    url_prefixes = ("http://", "https://", "ssh://", "git://", "git@")
    if candidate.startswith(url_prefixes):
        return candidate  # Already a full URL
    return f"https://github.com/{candidate}"
25
+
26
+
27
def clone_repo(repo_url_or_shorthand, clone_dir):
    """Clone a repository, given as URL or shorthand, into *clone_dir*.

    Raises:
        RuntimeError: If ``is_git_installed`` reports that Git is missing.
        subprocess.CalledProcessError: If ``git clone`` exits non-zero.
    """
    repo_url = convert_to_full_url(repo_url_or_shorthand)

    if not is_git_installed():
        raise RuntimeError(
            "Git is not installed. Please install Git to clone repositories."
        )

    command = ["git", "clone", repo_url, clone_dir]
    subprocess.run(command, check=True)
36
+
37
+
38
def extract_repo_name_from_url(repo_url):
    """Extract the repository name from the GitHub URL.

    The name is the last path segment cut at its first dot, so
    ``https://github.com/user/repo.git`` gives ``repo``. That truncation is
    kept as-is for existing callers.

    Dot-prefixed names such as ``.github`` would truncate to an empty string,
    which is unusable as a directory name. For those, the whole segment is
    returned instead, minus a trailing ``.git`` if present.

    Args:
        repo_url: Repository URL or ``user/repo`` shorthand.

    Returns:
        The derived repository name (empty only if *repo_url* has no usable
        last segment).
    """
    last_segment = repo_url.rstrip("/").split("/")[-1]
    stem = last_segment.split(".")[0]
    if stem:
        return stem

    # Leading-dot name: fall back to the full segment without ".git".
    suffix = ".git"
    if last_segment.endswith(suffix) and len(last_segment) > len(suffix):
        return last_segment[: -len(suffix)]
    return last_segment
42
+
43
+
44
def get_directory_structure(root_dir):
    """Get the directory structure in a tree format, ignoring .git directory.

    Each directory is listed as ``├── name/`` and each file as ``├── name``,
    indented four spaces per nesting level below *root_dir*.

    Args:
        root_dir: Directory to walk. A trailing path separator is accepted.

    Returns:
        The tree as a single newline-joined string.
    """
    # Normalise first so a trailing separator cannot blank out the root's
    # basename or skew the depth calculation.
    root_dir = os.path.normpath(root_dir)

    lines = []
    for current, dirs, files in os.walk(root_dir):
        if ".git" in dirs:
            dirs.remove(".git")  # Avoid walking into .git directory
        # os.walk order is filesystem-dependent; sorting in place makes the
        # traversal (and therefore the output) deterministic.
        dirs.sort()

        # Depth comes from the path relative to the root, not from string
        # replacement, which miscounts when the root text recurs in a path.
        relative = os.path.relpath(current, root_dir)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = " " * 4 * level
        lines.append(f"{indent}├── {os.path.basename(current)}/")

        subindent = " " * 4 * (level + 1)
        lines.extend(f"{subindent}├── {name}" for name in sorted(files))
    return "\n".join(lines)
59
+
60
+
61
def read_file_contents(file_path):
    """Read the contents of a file, ignore if in .git directory.

    Only a path *component* named exactly ``.git`` counts as git metadata, so
    files such as ``.gitignore`` or anything under ``.github/`` are read
    normally.

    Args:
        file_path: Path of the file to read as UTF-8 text.

    Returns:
        The file's text, ``"[Ignored .git directory]"`` for git metadata, or
        an ``"[Error reading file: ...]"`` placeholder when the file cannot
        be opened or decoded.
    """
    # Compare whole components; a substring test would also match
    # ".gitignore", ".gitattributes" and ".github".
    components = os.path.normpath(file_path).split(os.sep)
    if ".git" in components:
        return "[Ignored .git directory]"

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (UnicodeDecodeError, OSError) as e:
        return f"[Error reading file: {e}]"
71
+
72
+
73
def extract_all_files_contents(root_dir):
    """Extract contents of all files in the directory, ignoring .git directory.

    Args:
        root_dir: Directory to walk.

    Returns:
        A dict mapping each file's path relative to *root_dir* to whatever
        ``read_file_contents`` returns for it.
    """
    file_contents = {}
    for root, dirs, files in os.walk(root_dir):
        # Prune the real .git directory so it is never descended into. A
        # substring test on the path would also drop ".github/" and any
        # directory whose name merely contains ".git".
        if ".git" in dirs:
            dirs.remove(".git")
        dirs.sort()  # deterministic traversal order

        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(file_path, root_dir)
            file_contents[relative_path] = read_file_contents(file_path)
    return file_contents
85
+
86
+
87
def count_tokens(text):
    """Return how many whitespace-separated tokens *text* contains."""
    tokens = text.split()
    return len(tokens)
90
+
91
+
92
def _count_lines(text):
    """Return the number of lines in *text*, counting an unterminated last line."""
    if not text:
        return 0
    return text.count("\n") + (0 if text.endswith("\n") else 1)


def write_output_file(output_file, directory_structure, file_contents):
    """Write the directory structure and file contents to the output file with metadata.

    The file starts with ``Lines`` and ``Characters`` totals covering the
    directory structure and all file contents, followed by the structure and
    then each file's content, each wrapped in a ``` fence.

    Args:
        output_file: Path of the UTF-8 text file to create or overwrite.
        directory_structure: Tree listing text.
        file_contents: Mapping of relative file path to file text.
    """
    # Count lines rather than newline characters, so text without a trailing
    # newline (the tree listing never has one) is not undercounted.
    total_lines = _count_lines(directory_structure) + sum(
        _count_lines(content) for content in file_contents.values()
    )
    total_chars = len(directory_structure) + sum(
        len(content) for content in file_contents.values()
    )

    with open(output_file, "w", encoding="utf-8") as file:
        file.write(f"Lines: {total_lines}\nCharacters: {total_chars}\n\n")
        file.write("Directory Structure:\n```\n")
        file.write(directory_structure)
        file.write("\n```\n")

        for file_path, content in file_contents.items():
            file.write(f"\nContents of {file_path}:\n```\n")
            file.write(content)
            file.write("\n```\n")
111
+
112
+
113
def cleanup(clone_dir):
    """Delete the directory *clone_dir* if it exists.

    Removal is best-effort: per-entry failures are routed to
    ``handle_remove_error`` and any exception that still escapes is printed
    rather than raised.
    """
    if not os.path.exists(clone_dir):
        return

    try:
        shutil.rmtree(clone_dir, onerror=handle_remove_error)
    except Exception as error:
        print(f"An error occurred while cleaning up: {error}")
120
+
121
+
122
def handle_remove_error(func, path, exc_info):
    """``shutil.rmtree`` error callback that retries after a permission failure.

    Args:
        func: The function that failed; called again with *path* after the
            permission bits are changed.
        path: The path *func* failed on.
        exc_info: Exception triple; only the exception instance at index 1
            is inspected.
    """
    import stat

    error = exc_info[1]
    if not isinstance(error, PermissionError):
        print(f"Error removing {path}: {error}")
        return

    # Make the entry writable, then retry the operation that failed.
    os.chmod(path, stat.S_IWRITE)
    func(path)
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio
setup.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+ import codecs
3
+ import os
4
+
5
here = os.path.abspath(os.path.dirname(__file__))

# The README (prefixed with a newline) is used as the package's long
# description.
with codecs.open(os.path.join(here, "README.md"), encoding="utf-8") as fh:
    long_description = "\n" + fh.read()

setup(
    name="repo2txt",
    version="0.0.1",
    author="Blaise",
    author_email="[email protected]",
    description="A tool to clone GitHub repositories, document their directory structure, and extract file contents into a text file.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    license="MIT",
    packages=find_packages(),
    install_requires=[
        "click",
    ],
    # Installs the `repo2txt` command, bound to the click entry point.
    entry_points={
        "console_scripts": [
            "repo2txt=repo2txt.cli:cli",
        ],
    },
    url="https://github.com/blaise-tk/repo2txt",
    keywords=[
        "GitHub",
        "repository",
        "ai",
        "clone",
        "directory structure",
        "file extraction",
        "documentation",
        "CLI tool",
    ],
    classifiers=[
        # NOTE(review): the original "Development Status :: Release" is not a
        # registered trove classifier. The nearest registered value is used
        # here; confirm it matches the project's actual maturity.
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Programming Language :: Python :: 3",
        "Operating System :: MacOS :: MacOS X",
        "Operating System :: Microsoft :: Windows :: Windows 10",
        "Operating System :: POSIX :: Linux",
    ],
)