noumanjavaid committed on
Commit
ad33df7
·
verified ·
1 Parent(s): 0e07a73

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .commitlintrc +37 -0
  2. .dockerignore +13 -0
  3. .env.example +44 -0
  4. .gitattributes +4 -35
  5. .github/ISSUE_TEMPLATE/bug_report.yml +85 -0
  6. .github/ISSUE_TEMPLATE/config.yml +1 -0
  7. .github/ISSUE_TEMPLATE/feature_request.yml +49 -0
  8. .github/PULL_REQUEST_TEMPLATE.md +18 -0
  9. .github/workflows/auto-bump-and-release.yaml +62 -0
  10. .github/workflows/build-push-docker.yaml +103 -0
  11. .github/workflows/pr-lint.yaml +77 -0
  12. .github/workflows/style-check.yaml +20 -0
  13. .github/workflows/unit-test.yaml +109 -0
  14. .gitignore +479 -0
  15. .pre-commit-config.yaml +69 -0
  16. .python-version +1 -0
  17. CODE_OF_CONDUCT.md +128 -0
  18. CONTRIBUTING.md +115 -0
  19. Dockerfile +99 -0
  20. LICENSE.txt +201 -0
  21. README.md +365 -8
  22. app.py +25 -0
  23. doc_env_reqs.txt +9 -0
  24. docs/about.md +14 -0
  25. docs/development/contributing.md +116 -0
  26. docs/development/create-a-component.md +71 -0
  27. docs/development/data-components.md +34 -0
  28. docs/development/index.md +1 -0
  29. docs/development/utilities.md +169 -0
  30. docs/extra/css/code_select.css +5 -0
  31. docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png +0 -0
  32. docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png +0 -0
  33. docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png +0 -0
  34. docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png +0 -0
  35. docs/images/change_space_params.png +0 -0
  36. docs/images/chat-demo.gif +3 -0
  37. docs/images/chat-tab-demo.png +0 -0
  38. docs/images/chat-tab.png +0 -0
  39. docs/images/close_logs_space.png +0 -0
  40. docs/images/cohere_api_key.png +0 -0
  41. docs/images/duplicate_space.png +0 -0
  42. docs/images/file-index-tab.png +0 -0
  43. docs/images/index-embedding.png +0 -0
  44. docs/images/info-panel-scores.png +0 -0
  45. docs/images/initial_startup.png +0 -0
  46. docs/images/llm-default.png +0 -0
  47. docs/images/models.png +0 -0
  48. docs/images/pdf-viewer-setup.png +0 -0
  49. docs/images/preview-graph.png +0 -0
  50. docs/images/preview.png +0 -0
.commitlintrc ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "extends": ["@commitlint/config-conventional"],
3
+ "defaultIgnores": true,
4
+ "rules": {
5
+ "body-leading-blank": [1, "always"],
6
+ "body-max-line-length": [2, "always", 100],
7
+ "footer-leading-blank": [1, "always"],
8
+ "footer-max-line-length": [2, "always", 10000],
9
+ "header-max-length": [2, "always", 200],
10
+ "subject-case": [
11
+ 2,
12
+ "never",
13
+ []
14
+ ],
15
+ "subject-empty": [2, "never"],
16
+ "subject-full-stop": [2, "never", "."],
17
+ "type-case": [2, "always", "lower-case"],
18
+ "type-empty": [2, "never"],
19
+ "type-enum": [
20
+ 2,
21
+ "always",
22
+ [
23
+ "build",
24
+ "chore",
25
+ "ci",
26
+ "docs",
27
+ "feat",
28
+ "fix",
29
+ "perf",
30
+ "refactor",
31
+ "revert",
32
+ "style",
33
+ "test"
34
+ ]
35
+ ]
36
+ }
37
+ }
.dockerignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .github/
2
+ .git/
3
+ .mypy_cache/
4
+ __pycache__/
5
+ ktem_app_data/
6
+ env/
7
+ .pre-commit-config.yaml
8
+ .commitlintrc
9
+ .gitignore
10
+ .gitattributes
11
+ README.md
12
+ *.zip
13
+ *.sh
.env.example ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this is an example .env file, use it to create your own .env file and place it in the root of the project
2
+
3
+ # settings for OpenAI
4
+ OPENAI_API_BASE=https://api.openai.com/v1
5
+ OPENAI_API_KEY=<YOUR_OPENAI_KEY>
6
+ OPENAI_CHAT_MODEL=gpt-3.5-turbo
7
+ OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
8
+
9
+ # settings for Azure OpenAI
10
+ AZURE_OPENAI_ENDPOINT=
11
+ AZURE_OPENAI_API_KEY=
12
+ OPENAI_API_VERSION=2024-02-15-preview
13
+ AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
14
+ AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
15
+
16
+ # settings for Cohere
17
+ COHERE_API_KEY=<COHERE_API_KEY>
18
+
19
+ # settings for local models
20
+ LOCAL_MODEL=llama3.1:8b
21
+ LOCAL_MODEL_EMBEDDINGS=nomic-embed-text
22
+ LOCAL_EMBEDDING_MODEL_DIM=768
23
+ LOCAL_EMBEDDING_MODEL_MAX_TOKENS=8192
24
+
25
+ # settings for GraphRAG
26
+ GRAPHRAG_API_KEY=<YOUR_OPENAI_KEY>
27
+ GRAPHRAG_LLM_MODEL=gpt-4o-mini
28
+ GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small
29
+
30
+ # set to true if you want to use customized GraphRAG config file
31
+ USE_CUSTOMIZED_GRAPHRAG_SETTING=false
32
+
33
+ # settings for Azure DI
34
+ AZURE_DI_ENDPOINT=
35
+ AZURE_DI_CREDENTIAL=
36
+
37
+ # settings for Adobe API
38
+ # get free credential at https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api
39
+ # also install pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
40
+ PDF_SERVICES_CLIENT_ID=
41
+ PDF_SERVICES_CLIENT_SECRET=
42
+
43
+ # settings for PDF.js
44
+ PDFJS_VERSION_DIST="pdfjs-4.0.379-dist"
.gitattributes CHANGED
@@ -1,35 +1,4 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.bat text eol=crlf
2
+ docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text
3
+ kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text
4
+ kotaemon/kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.github/ISSUE_TEMPLATE/bug_report.yml ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Bug Report"
2
+ description: Report something that is not working as expected
3
+ title: "[BUG] "
4
+ labels: ["bug"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ *Please fill this form with as much information as possible.*
10
+ - type: textarea
11
+ id: description
12
+ attributes:
13
+ label: "Description"
14
+ description: Please enter an explicit description of your issue
15
+ placeholder: Short and explicit description of your incident...
16
+ validations:
17
+ required: true
18
+ - type: textarea
19
+ id: reprod
20
+ attributes:
21
+ label: "Reproduction steps"
22
+ description: Please list the exact steps needed to reproduce your issue
23
+ value: |
24
+ 1. Go to '...'
25
+ 2. Click on '....'
26
+ 3. Scroll down to '....'
27
+ 4. See error
28
+ render: bash
29
+ validations:
30
+ required: true
31
+ - type: textarea
32
+ id: screenshot
33
+ attributes:
34
+ label: "Screenshots"
35
+ description: If applicable, add screenshots to help explain your problem.
36
+ value: |
37
+ ![DESCRIPTION](LINK.png)
38
+ render: bash
39
+ validations:
40
+ required: false
41
+ - type: textarea
42
+ id: logs
43
+ attributes:
44
+ label: "Logs"
45
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
46
+ render: bash
47
+ validations:
48
+ required: false
49
+ - type: dropdown
50
+ id: browsers
51
+ attributes:
52
+ label: "Browsers"
53
+ description: Which browsers are you seeing the problem on?
54
+ multiple: true
55
+ options:
56
+ - Firefox
57
+ - Chrome
58
+ - Safari
59
+ - Microsoft Edge
60
+ - Opera
61
+ - Brave
62
+ - Other
63
+ validations:
64
+ required: false
65
+ - type: dropdown
66
+ id: os
67
+ attributes:
68
+ label: "OS"
69
+ description: Which environment is impacted?
70
+ multiple: true
71
+ options:
72
+ - Windows
73
+ - MacOS
74
+ - Linux
75
+ - Other
76
+ validations:
77
+ required: false
78
+ - type: textarea
79
+ id: additional_information
80
+ attributes:
81
+ label: "Additional information"
82
+ description: Add any relevant information or context.
83
+ placeholder:
84
+ validations:
85
+ required: false
.github/ISSUE_TEMPLATE/config.yml ADDED
@@ -0,0 +1 @@
 
 
1
+ blank_issues_enabled: false
.github/ISSUE_TEMPLATE/feature_request.yml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Feature Request"
2
+ description: Brainstorm and propose new features for the project
3
+ title: "[REQUEST] "
4
+ labels: ["enhancement"]
5
+ body:
6
+ - type: markdown
7
+ attributes:
8
+ value: |
9
+ *Please fill this form with as much information as possible.*
10
+ - type: textarea
11
+ id: reference_issues
12
+ attributes:
13
+ label: "Reference Issues"
14
+ description: Common issues
15
+ placeholder: "#Issues IDs"
16
+ validations:
17
+ required: false
18
+ - type: textarea
19
+ id: summary
20
+ attributes:
21
+ label: "Summary"
22
+ description: Provide a brief explanation of the feature
23
+ placeholder: Describe in a few lines your feature request
24
+ validations:
25
+ required: true
26
+ - type: textarea
27
+ id: basic_example
28
+ attributes:
29
+ label: "Basic Example"
30
+ description: Indicate here some basic examples of your feature.
31
+ placeholder: A few specific words about your feature request.
32
+ validations:
33
+ required: true
34
+ - type: textarea
35
+ id: drawbacks
36
+ attributes:
37
+ label: "Drawbacks"
38
+ description: What are the drawbacks/impacts of your feature request?
39
+ placeholder: Identify the drawbacks and impacts while being neutral on your feature request
40
+ validations:
41
+ required: true
42
+ - type: textarea
43
+ id: additional_information
44
+ attributes:
45
+ label: "Additional information"
46
+ description: Add any additional information that you think is important for your feature request
47
+ placeholder:
48
+ validations:
49
+ required: false
.github/PULL_REQUEST_TEMPLATE.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Description
2
+
3
+ - Please include a summary of the changes and the related issue.
4
+ - Fixes # (issue)
5
+
6
+ ## Type of change
7
+
8
+ - [ ] New features (non-breaking change).
9
+ - [ ] Bug fix (non-breaking change).
10
+ - [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected).
11
+
12
+ ## Checklist
13
+
14
+ - [ ] I have performed a self-review of my code.
15
+ - [ ] I have added thorough tests if it is a core feature.
16
+ - [ ] There is a reference to the original bug report and related work.
17
+ - [ ] I have commented on my code, particularly in hard-to-understand areas.
18
+ - [ ] The feature is well documented.
.github/workflows/auto-bump-and-release.yaml ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Auto Bump and Release
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+
8
+ jobs:
9
+ auto-bump-and-release:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - name: Clone the repo
13
+ uses: actions/checkout@v4
14
+ with:
15
+ fetch-depth: 0
16
+ - name: Update Application Version
17
+ id: update-version
18
+ uses: anothrNick/github-tag-action@v1
19
+ env:
20
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
21
+ WITH_V: true
22
+ DEFAULT_BUMP: patch
23
+ MAJOR_STRING_TOKEN: "bump:major"
24
+ MINOR_STRING_TOKEN: "bump:minor"
25
+ PATCH_STRING_TOKEN: "bump:patch"
26
+ - name: Create release for ${{ steps.update-version.outputs.new_tag }}
27
+ # need to repeat this if statement because GitHub Actions doesn't support early
28
+ # stopping for steps
29
+ if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
30
+ run: |
31
+ echo Create release folder
32
+ mkdir kotaemon-app
33
+ echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION
34
+ cp LICENSE.txt kotaemon-app/
35
+ cp flowsettings.py kotaemon-app/
36
+ cp app.py kotaemon-app/
37
+ cp .env.example kotaemon-app/.env
38
+ cp -r scripts kotaemon-app/
39
+ mkdir -p kotaemon-app/libs/ktem/ktem/
40
+ cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/
41
+
42
+ tree kotaemon-app
43
+ zip -r kotaemon-app.zip kotaemon-app
44
+ - name: Release ${{ steps.update-version.outputs.new_tag }}
45
+ if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
46
+ uses: softprops/action-gh-release@v2
47
+ with:
48
+ files: kotaemon-app.zip
49
+ fail_on_unmatched_files: true
50
+ token: ${{ secrets.GITHUB_TOKEN }}
51
+ generate_release_notes: true
52
+ tag_name: ${{ steps.update-version.outputs.new_tag }}
53
+ make_latest: true
54
+ - name: Setup latest branch locally without switching current branch
55
+ if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
56
+ run: git fetch origin latest:latest
57
+ - name: Update latest branch
58
+ if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
59
+ run: |
60
+ git branch -f latest tags/${{ steps.update-version.outputs.new_tag }}
61
+ git checkout latest
62
+ git push -f -u origin latest
.github/workflows/build-push-docker.yaml ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build and Push Docker Image
2
+
3
+ on:
4
+ release:
5
+ types:
6
+ - created
7
+
8
+ push:
9
+ tags:
10
+ - "v[0-9]+.[0-9]+.[0-9]+"
11
+
12
+ workflow_dispatch:
13
+
14
+ env:
15
+ REGISTRY: ghcr.io
16
+
17
+ jobs:
18
+ build:
19
+ name: Build and push container
20
+ runs-on: ubuntu-latest
21
+ permissions:
22
+ contents: read
23
+ packages: write
24
+ attestations: write
25
+ id-token: write
26
+ strategy:
27
+ matrix:
28
+ target:
29
+ - lite
30
+ - full
31
+ steps:
32
+ - name: Free Disk Space (Ubuntu)
33
+ uses: jlumbroso/free-disk-space@main
34
+ with:
35
+ # this might remove tools that are actually needed,
36
+ # if set to "true" but frees about 6 GB
37
+ tool-cache: true
38
+
39
+ # all of these default to true, but feel free to set to
40
+ # "false" if necessary for your workflow
41
+ android: true
42
+ dotnet: true
43
+ haskell: true
44
+ large-packages: true
45
+ docker-images: true
46
+ swap-storage: true
47
+
48
+ - name: Set repository and image name
49
+ run: |
50
+ echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>${GITHUB_ENV}
51
+ env:
52
+ IMAGE_NAME: "${{ github.repository }}"
53
+
54
+ - name: Checkout
55
+ uses: actions/checkout@v4
56
+
57
+ - name: Set up QEMU
58
+ uses: docker/setup-qemu-action@v3
59
+ with:
60
+ image: tonistiigi/binfmt:latest
61
+ platforms: arm64,arm
62
+
63
+ - name: Set up Docker Buildx
64
+ id: buildx
65
+ uses: docker/setup-buildx-action@v3
66
+
67
+ - name: Set up Docker meta
68
+ id: meta
69
+ uses: docker/metadata-action@v5
70
+ with:
71
+ images: ${{ env.FULL_IMAGE_NAME }}
72
+ tags: |
73
+ # branch
74
+ type=ref,event=branch,suffix=-${{ matrix.target }}
75
+ # semver with suffix for lite/full targets
76
+ type=semver,pattern={{version}},suffix=-${{ matrix.target }}
77
+ # latest tag with suffix for lite/full targets
78
+ type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, 'pre') }},suffix=-${{ matrix.target }}
79
+ flavor: |
80
+ # This is disabled here so we can use the raw form above
81
+ latest=false
82
+ # Suffix is not used here since there's no way to disable it above
83
+
84
+ - name: Log in to the Container registry
85
+ uses: docker/login-action@v3
86
+ with:
87
+ registry: ${{ env.REGISTRY }}
88
+ username: ${{ github.actor }}
89
+ password: ${{ secrets.GITHUB_TOKEN }}
90
+
91
+ - name: Build docker image
92
+ uses: docker/build-push-action@v6
93
+ with:
94
+ file: Dockerfile
95
+ context: .
96
+ push: true
97
+ platforms: linux/amd64, linux/arm64
98
+ tags: |
99
+ ${{ steps.meta.outputs.tags }}
100
+ labels: ${{ steps.meta.outputs.labels }}
101
+ target: ${{ matrix.target }}
102
+ cache-from: type=gha
103
+ cache-to: type=gha,mode=max
.github/workflows/pr-lint.yaml ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "Lint PR"
2
+
3
+ on:
4
+ pull_request:
5
+ types:
6
+ - opened
7
+ - edited
8
+ - synchronize
9
+
10
+ permissions:
11
+ pull-requests: write
12
+
13
+ jobs:
14
+ pr-title:
15
+ name: Validate PR title
16
+ runs-on: ubuntu-latest
17
+ permissions: write-all
18
+ steps:
19
+ - uses: amannn/action-semantic-pull-request@v5
20
+ id: lint_pr_title
21
+ env:
22
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
23
+
24
+ - uses: marocchino/sticky-pull-request-comment@v2
25
+ # When the previous step fails, the workflow would stop. By adding this
26
+ # condition you can continue the execution with the populated error message.
27
+ if: always() && (steps.lint_pr_title.outputs.error_message != null)
28
+ with:
29
+ header: pr-title-lint-error
30
+ message: |
31
+ Hey there and thank you for opening this pull request! 👋🏼
32
+
33
+ We require pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted.
34
+ Details:
35
+ ```
36
+ ${{ steps.lint_pr_title.outputs.error_message }}
37
+ ```
38
+
39
+ # Delete a previous comment when the issue has been resolved
40
+ - if: ${{ steps.lint_pr_title.outputs.error_message == null }}
41
+ uses: marocchino/sticky-pull-request-comment@v2
42
+ with:
43
+ header: pr-title-lint-error
44
+ delete: true
45
+
46
+ commitlint:
47
+ if: false # Disable this job for now
48
+ name: Validate commit messages
49
+ runs-on: ubuntu-latest
50
+ permissions: write-all
51
+ steps:
52
+ - uses: actions/checkout@v4
53
+ - uses: wagoid/commitlint-github-action@v6
54
+ id: commitlint
55
+ env:
56
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
57
+ with:
58
+ configFile: ./.commitlintrc
59
+ - uses: buildingcash/json-to-markdown-table-action@v1
60
+ if: always() && (steps.commitlint.outcome != 'success')
61
+ id: table
62
+ with:
63
+ json: ${{ steps.commitlint.outputs.results }}
64
+ - uses: marocchino/sticky-pull-request-comment@v2
65
+ if: always() && (steps.commitlint.outcome != 'success')
66
+ with:
67
+ header: commitlint-error
68
+ message: |
69
+ **All commits** in this PR need to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and [.commitlintrc](${{ github.server_url }}/${{ github.repository }}/blob/${{ github.head_ref || github.ref_name }}/.commitlintrc).
70
+ Details:
71
+ ${{ steps.table.outputs.table }}
72
+
73
+ - if: ${{ steps.commitlint.outcome == 'success' }}
74
+ uses: marocchino/sticky-pull-request-comment@v2
75
+ with:
76
+ header: commitlint-error
77
+ delete: true
.github/workflows/style-check.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: style-check
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main, develop]
6
+ push:
7
+ branches: [main, develop]
8
+
9
+ jobs:
10
+ pre-commit:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - name: Clone the repo
14
+ uses: actions/checkout@v4
15
+ - name: Setup python
16
+ uses: actions/setup-python@v4
17
+ with:
18
+ python-version: "3.10"
19
+ - name: run pre-commit
20
+ uses: pre-commit/action@v3.0.0
.github/workflows/unit-test.yaml ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: unit-test
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ push:
7
+ branches: [main]
8
+
9
+ env:
10
+ THEFLOW_TEMP_PATH: ./tmp
11
+
12
+ jobs:
13
+ unit-test:
14
+ # if: false # temporary disable this job due to legacy interface
15
+ #TODO: enable this job after the new interface is ready
16
+ if: ${{ !cancelled() }}
17
+ runs-on: ${{ matrix.os }}
18
+ timeout-minutes: 20
19
+ defaults:
20
+ run:
21
+ shell: ${{ matrix.shell }}
22
+ strategy:
23
+ matrix:
24
+ python-version: ["3.10", "3.11"]
25
+ include:
26
+ - os: ubuntu-latest
27
+ shell: bash
28
+ ACTIVATE_ENV: ". env/bin/activate"
29
+ GITHUB_OUTPUT: "$GITHUB_OUTPUT"
30
+ # - os: windows-latest
31
+ # shell: pwsh
32
+ # ACTIVATE_ENV: env/Scripts/activate.ps1
33
+ # GITHUB_OUTPUT: "$env:GITHUB_OUTPUT"
34
+
35
+ name: unit testing with python ${{ matrix.python-version }}
36
+ steps:
37
+ - name: Clone the repo
38
+ uses: actions/checkout@v4
39
+ with:
40
+ ref: ${{ github.event.pull_request.head.sha }}
41
+
42
+ - name: Get Head Commit Message
43
+ id: get-head-commit-message
44
+ run: echo "message=$(git show -s --format=%s)" | tee -a ${{ matrix.GITHUB_OUTPUT }}
45
+
46
+ - name: Check ignore caching
47
+ id: check-ignore-cache
48
+ run: |
49
+ ignore_cache=${{ contains(steps.get-head-commit-message.outputs.message, '[ignore cache]') }}
50
+ echo "check=$ignore_cache" | tee -a ${{ matrix.GITHUB_OUTPUT }}
51
+
52
+ - name: Set up Python ${{ matrix.python-version }} on ${{ runner.os }}
53
+ uses: actions/setup-python@v4
54
+ id: setup_python
55
+ with:
56
+ python-version: ${{ matrix.python-version }}
57
+ architecture: x64
58
+
59
+ - name: Get cache key
60
+ id: get-cache-key
61
+ run: |
62
+ pip install "setuptools-git-versioning>=2.0,<3"
63
+ package_version=$(setuptools-git-versioning)
64
+ cache_key="${{ runner.os }}-py${{ matrix.python-version }}-v${package_version}"
65
+ echo "key=$cache_key" | tee -a ${{ matrix.GITHUB_OUTPUT }}
66
+
67
+ - name: Try to restore dependencies from ${{ steps.get-cache-key.outputs.key }}
68
+ id: restore-dependencies
69
+ if: steps.check-ignore-cache.outputs.check != 'true'
70
+ uses: actions/cache/restore@v3
71
+ with:
72
+ path: ${{ env.pythonLocation }}
73
+ key: ${{ steps.get-cache-key.outputs.key }}
74
+ # fall back to a cache from a previous version to reuse unchanged packages
75
+ restore-keys: ${{ runner.os }}-py${{ matrix.python-version }}
76
+
77
+ - name: Check cache hit
78
+ id: check-cache-hit
79
+ run: |
80
+ echo "cache-hit=${{ steps.restore-dependencies.outputs.cache-hit }}"
81
+ echo "cache-matched-key=${{ steps.restore-dependencies.outputs.cache-matched-key }}"
82
+ cache_hit=${{ steps.restore-dependencies.outputs.cache-primary-key == steps.restore-dependencies.outputs.cache-matched-key }}
83
+ echo "check=$cache_hit" | tee -a ${{ matrix.GITHUB_OUTPUT }}
84
+
85
+ - name: Install additional dependencies (if any)
86
+ run: |
87
+ python -m pip install --upgrade pip
88
+ cd libs/kotaemon
89
+ pip install -U --upgrade-strategy eager -e .[all]
90
+
91
+ - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
92
+ if: |
93
+ steps.check-ignore-cache.outputs.check != 'true' &&
94
+ steps.check-cache-hit.outputs.check != 'true'
95
+ uses: actions/cache/save@v3
96
+ with:
97
+ path: ${{ env.pythonLocation }}
98
+ key: ${{ steps.restore-dependencies.outputs.cache-primary-key }}
99
+
100
+ - name: Install OS-based packages
101
+ run: |
102
+ sudo apt update -qqy
103
+ sudo apt install -y poppler-utils libpoppler-dev tesseract-ocr
104
+
105
+ - name: Test kotaemon with pytest
106
+ run: |
107
+ pip show pytest
108
+ cd libs/kotaemon
109
+ pytest
.gitignore ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
2
+ # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
3
+
4
+ activate*
5
+ activate/*
6
+ kotaemon-env*
7
+ .env
8
+
9
+ ### Emacs ###
10
+ # -*- mode: gitignore; -*-
11
+ *~
12
+ \#*\#
13
+ /.emacs.desktop
14
+ /.emacs.desktop.lock
15
+ *.elc
16
+ auto-save-list
17
+ tramp
18
+ .\#*
19
+
20
+ # Org-mode
21
+ .org-id-locations
22
+ *_archive
23
+
24
+ # flymake-mode
25
+ *_flymake.*
26
+
27
+ # eshell files
28
+ /eshell/history
29
+ /eshell/lastdir
30
+
31
+ # elpa packages
32
+ /elpa/
33
+
34
+ # reftex files
35
+ *.rel
36
+
37
+ # AUCTeX auto folder
38
+ /auto/
39
+
40
+ # cask packages
41
+ .cask/
42
+ dist/
43
+
44
+ # Flycheck
45
+ flycheck_*.el
46
+
47
+ # server auth directory
48
+ /server/
49
+
50
+ # projectiles files
51
+ .projectile
52
+
53
+ # directory configuration
54
+ .dir-locals.el
55
+
56
+ # network security
57
+ /network-security.data
58
+
59
+ ### Linux ###
60
+
61
+ # temporary files which can be created if a process still has a handle open of a deleted file
62
+ .fuse_hidden*
63
+
64
+ # KDE directory preferences
65
+ .directory
66
+
67
+ # Linux trash folder which might appear on any partition or disk
68
+ .Trash-*
69
+
70
+ # .nfs files are created when an open file is removed but is still being accessed
71
+ .nfs*
72
+
73
+ ### macOS ###
74
+ # General
75
+ .DS_Store
76
+ .AppleDouble
77
+ .LSOverride
78
+
79
+ # Icon must end with two \r
80
+ Icon
81
+
82
+ # Thumbnails
83
+ ._*
84
+
85
+ # Files that might appear in the root of a volume
86
+ .DocumentRevisions-V100
87
+ .fseventsd
88
+ .Spotlight-V100
89
+ .TemporaryItems
90
+ .Trashes
91
+ .VolumeIcon.icns
92
+ .com.apple.timemachine.donotpresent
93
+
94
+ # Directories potentially created on remote AFP share
95
+ .AppleDB
96
+ .AppleDesktop
97
+ Network Trash Folder
98
+ Temporary Items
99
+ .apdisk
100
+
101
+ ### macOS Patch ###
102
+ # iCloud generated files
103
+ *.icloud
104
+
105
+ ### PyCharm ###
106
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
107
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
108
+
109
+ # User-specific stuff
110
+ .idea/**/workspace.xml
111
+ .idea/**/tasks.xml
112
+ .idea/**/usage.statistics.xml
113
+ .idea/**/dictionaries
114
+ .idea/**/shelf
115
+
116
+ # AWS User-specific
117
+ .idea/**/aws.xml
118
+
119
+ # Generated files
120
+ .idea/**/contentModel.xml
121
+
122
+ # Sensitive or high-churn files
123
+ .idea/**/dataSources/
124
+ .idea/**/dataSources.ids
125
+ .idea/**/dataSources.local.xml
126
+ .idea/**/sqlDataSources.xml
127
+ .idea/**/dynamic.xml
128
+ .idea/**/uiDesigner.xml
129
+ .idea/**/dbnavigator.xml
130
+
131
+ # Gradle
132
+ .idea/**/gradle.xml
133
+ .idea/**/libraries
134
+
135
+ # Gradle and Maven with auto-import
136
+ # When using Gradle or Maven with auto-import, you should exclude module files,
137
+ # since they will be recreated, and may cause churn. Uncomment if using
138
+ # auto-import.
139
+ # .idea/artifacts
140
+ # .idea/compiler.xml
141
+ # .idea/jarRepositories.xml
142
+ # .idea/modules.xml
143
+ # .idea/*.iml
144
+ # .idea/modules
145
+ # *.iml
146
+ # *.ipr
147
+
148
+ # CMake
149
+ cmake-build-*/
150
+
151
+ # Mongo Explorer plugin
152
+ .idea/**/mongoSettings.xml
153
+
154
+ # File-based project format
155
+ *.iws
156
+
157
+ # IntelliJ
158
+ out/
159
+
160
+ # mpeltonen/sbt-idea plugin
161
+ .idea_modules/
162
+
163
+ # JIRA plugin
164
+ atlassian-ide-plugin.xml
165
+
166
+ # Cursive Clojure plugin
167
+ .idea/replstate.xml
168
+
169
+ # SonarLint plugin
170
+ .idea/sonarlint/
171
+
172
+ # Crashlytics plugin (for Android Studio and IntelliJ)
173
+ com_crashlytics_export_strings.xml
174
+ crashlytics.properties
175
+ crashlytics-build.properties
176
+ fabric.properties
177
+
178
+ # Editor-based Rest Client
179
+ .idea/httpRequests
180
+
181
+ # Android studio 3.1+ serialized cache file
182
+ .idea/caches/build_file_checksums.ser
183
+
184
+ ### PyCharm Patch ###
185
+ # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
186
+
187
+ # *.iml
188
+ # modules.xml
189
+ # .idea/misc.xml
190
+ # *.ipr
191
+
192
+ # Sonarlint plugin
193
+ # https://plugins.jetbrains.com/plugin/7973-sonarlint
194
+ .idea/**/sonarlint/
195
+
196
+ # SonarQube Plugin
197
+ # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
198
+ .idea/**/sonarIssues.xml
199
+
200
+ # Markdown Navigator plugin
201
+ # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
202
+ .idea/**/markdown-navigator.xml
203
+ .idea/**/markdown-navigator-enh.xml
204
+ .idea/**/markdown-navigator/
205
+
206
+ # Cache file creation bug
207
+ # See https://youtrack.jetbrains.com/issue/JBR-2257
208
+ .idea/$CACHE_FILE$
209
+
210
+ # CodeStream plugin
211
+ # https://plugins.jetbrains.com/plugin/12206-codestream
212
+ .idea/codestream.xml
213
+
214
+ # Azure Toolkit for IntelliJ plugin
215
+ # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
216
+ .idea/**/azureSettings.xml
217
+
218
+ ### Python ###
219
+ # Byte-compiled / optimized / DLL files
220
+ __pycache__/
221
+ *.py[cod]
222
+ *$py.class
223
+
224
+ # C extensions
225
+ *.so
226
+
227
+ # Distribution / packaging
228
+ .Python
229
+ build/
230
+ develop-eggs/
231
+ downloads/
232
+ eggs/
233
+ .eggs/
234
+ lib/
235
+ lib64/
236
+ parts/
237
+ sdist/
238
+ var/
239
+ wheels/
240
+ share/python-wheels/
241
+ *.egg-info/
242
+ .installed.cfg
243
+ *.egg
244
+ MANIFEST
245
+
246
+ # PyInstaller
247
+ # Usually these files are written by a python script from a template
248
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
249
+ *.manifest
250
+ *.spec
251
+
252
+ # Installer logs
253
+ pip-log.txt
254
+ pip-delete-this-directory.txt
255
+
256
+ # Unit test / coverage reports
257
+ htmlcov/
258
+ .tox/
259
+ .nox/
260
+ .coverage
261
+ .coverage.*
262
+ .cache
263
+ nosetests.xml
264
+ coverage.xml
265
+ *.cover
266
+ *.py,cover
267
+ .hypothesis/
268
+ .pytest_cache/
269
+ cover/
270
+
271
+ # Translations
272
+ *.mo
273
+ *.pot
274
+
275
+ # Django stuff:
276
+ *.log
277
+ local_settings.py
278
+ db.sqlite3
279
+ db.sqlite3-journal
280
+
281
+ # Flask stuff:
282
+ instance/
283
+ .webassets-cache
284
+
285
+ # Scrapy stuff:
286
+ .scrapy
287
+
288
+ # Sphinx documentation
289
+ docs/_build/
290
+
291
+ # PyBuilder
292
+ .pybuilder/
293
+ target/
294
+
295
+ # Jupyter Notebook
296
+ .ipynb_checkpoints
297
+
298
+ # IPython
299
+ profile_default/
300
+ ipython_config.py
301
+
302
+ # pyenv
303
+ # For a library or package, you might want to ignore these files since the code is
304
+ # intended to run in multiple environments; otherwise, check them in:
305
+ # .python-version
306
+
307
+ # pipenv
308
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
309
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
310
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
311
+ # install all needed dependencies.
312
+ #Pipfile.lock
313
+
314
+ # poetry
315
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
316
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
317
+ # commonly ignored for libraries.
318
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
319
+ #poetry.lock
320
+
321
+ # pdm
322
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
323
+ #pdm.lock
324
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
325
+ # in version control.
326
+ # https://pdm.fming.dev/#use-with-ide
327
+ .pdm.toml
328
+
329
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
330
+ __pypackages__/
331
+
332
+ # Celery stuff
333
+ celerybeat-schedule
334
+ celerybeat.pid
335
+
336
+ # SageMath parsed files
337
+ *.sage.py
338
+
339
+ # Environments
340
+ .venv
341
+ env/
342
+ venv/
343
+ ENV/
344
+ env.bak/
345
+ venv.bak/
346
+
347
+ # Spyder project settings
348
+ .spyderproject
349
+ .spyproject
350
+
351
+ # Rope project settings
352
+ .ropeproject
353
+
354
+ # mkdocs documentation
355
+ /site
356
+
357
+ # mypy
358
+ .mypy_cache/
359
+ .dmypy.json
360
+ dmypy.json
361
+
362
+ # Pyre type checker
363
+ .pyre/
364
+
365
+ # pytype static type analyzer
366
+ .pytype/
367
+
368
+ # Cython debug symbols
369
+ cython_debug/
370
+
371
+ # PyCharm
372
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
373
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
374
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
375
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
376
+ #.idea/
377
+
378
+ ### Python Patch ###
379
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
380
+ poetry.toml
381
+
382
+ # ruff
383
+ .ruff_cache/
384
+
385
+ # LSP config files
386
+ pyrightconfig.json
387
+
388
+ ### Vim ###
389
+ # Swap
390
+ [._]*.s[a-v][a-z]
391
+ # comment out if you don't need vector files
+ !*.svg
392
+ [._]*.sw[a-p]
393
+ [._]s[a-rt-v][a-z]
394
+ [._]ss[a-gi-z]
395
+ [._]sw[a-p]
396
+
397
+ # Session
398
+ Session.vim
399
+ Sessionx.vim
400
+
401
+ # Temporary
402
+ .netrwhist
403
+ # Auto-generated tag files
404
+ tags
405
+ # Persistent undo
406
+ [._]*.un~
407
+
408
+ ### VisualStudioCode ###
409
+ .vscode/*
410
+ !.vscode/settings.json
411
+ !.vscode/tasks.json
412
+ !.vscode/launch.json
413
+ !.vscode/extensions.json
414
+ !.vscode/*.code-snippets
415
+
416
+ # Local History for Visual Studio Code
417
+ .history/
418
+
419
+ # Built Visual Studio Code Extensions
420
+ *.vsix
421
+
422
+ ### VisualStudioCode Patch ###
423
+ # Ignore all local history of files
424
+ .history
425
+ .ionide
426
+
427
+ ### Windows ###
428
+ # Windows thumbnail cache files
429
+ Thumbs.db
430
+ Thumbs.db:encryptable
431
+ ehthumbs.db
432
+ ehthumbs_vista.db
433
+
434
+ # Dump file
435
+ *.stackdump
436
+
437
+ # Folder config file
438
+ [Dd]esktop.ini
439
+
440
+ # Recycle Bin used on file shares
441
+ $RECYCLE.BIN/
442
+
443
+ # Windows Installer files
444
+ *.cab
445
+ *.msi
446
+ *.msix
447
+ *.msm
448
+ *.msp
449
+
450
+ # Windows shortcuts
451
+ *.lnk
452
+
453
+ # PDF files
454
+ *.pdf
455
+ !libs/kotaemon/tests/resources/*.pdf
456
+
457
+ .theflow/
458
+
459
+ # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
460
+ *.py[coid]
461
+
462
+ logs/
463
+ .gitsecret/keys/random_seed
464
+ !*.secret
465
+ .envrc
466
+ .env
467
+
468
+ S.gpg-agent*
469
+ .vscode/settings.json
470
+ examples/example1/assets
471
+ storage/*
472
+
473
+ # Conda and env storages
474
+ *install_dir/
475
+ doc_env/
476
+
477
+ # application data
478
+ ktem_app_data/
479
+ gradio_tmp/
.pre-commit-config.yaml ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.3.0
4
+ hooks:
5
+ - id: check-yaml
6
+ args: ["--unsafe"]
7
+ - id: check-toml
8
+ - id: end-of-file-fixer
9
+ - id: trailing-whitespace
10
+ - id: mixed-line-ending
11
+ - id: detect-aws-credentials
12
+ args: ["--allow-missing-credentials"]
13
+ - id: detect-private-key
14
+ - id: check-added-large-files
15
+ args: ["--maxkb=750"]
16
+ - id: debug-statements
17
+ - repo: https://github.com/ambv/black
18
+ rev: 22.3.0
19
+ hooks:
20
+ - id: black
21
+ language_version: python3
22
+ - repo: https://github.com/pycqa/isort
23
+ rev: 5.12.0
24
+ hooks:
25
+ - id: isort
26
+ args: ["--profile", "black"]
27
+ language_version: python3.10
28
+ - repo: https://github.com/pycqa/flake8
29
+ rev: 4.0.1
30
+ hooks:
31
+ - id: flake8
32
+ args: ["--max-line-length", "88", "--extend-ignore", "E203"]
33
+ - repo: https://github.com/myint/autoflake
34
+ rev: v1.4
35
+ hooks:
36
+ - id: autoflake
37
+ args:
38
+ [
39
+ "--in-place",
40
+ "--remove-unused-variables",
41
+ "--remove-all-unused-imports",
42
+ "--ignore-init-module-imports",
43
+ "--exclude=tests/*",
44
+ ]
45
+ - repo: https://github.com/pre-commit/mirrors-prettier
46
+ rev: v2.7.1
47
+ hooks:
48
+ - id: prettier
49
+ types_or: [markdown, yaml]
50
+ - repo: https://github.com/pre-commit/mirrors-mypy
51
+ rev: "v1.7.1"
52
+ hooks:
53
+ - id: mypy
54
+ additional_dependencies:
55
+ [
56
+ types-PyYAML==6.0.12.11,
57
+ "types-requests",
58
+ "sqlmodel",
59
+ "types-Markdown",
60
+ types-tzlocal,
61
+ ]
62
+ args: ["--check-untyped-defs", "--ignore-missing-imports"]
63
+ exclude: "^templates/"
64
+ - repo: https://github.com/codespell-project/codespell
65
+ rev: v2.2.4
66
+ hooks:
67
+ - id: codespell
68
+ additional_dependencies:
69
+ - tomli
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributor Covenant Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ We as members, contributors, and leaders pledge to make participation in our
6
+ community a harassment-free experience for everyone, regardless of age, body
7
+ size, visible or invisible disability, ethnicity, sex characteristics, gender
8
+ identity and expression, level of experience, education, socio-economic status,
9
+ nationality, personal appearance, race, religion, or sexual identity
10
+ and orientation.
11
+
12
+ We pledge to act and interact in ways that contribute to an open, welcoming,
13
+ diverse, inclusive, and healthy community.
14
+
15
+ ## Our Standards
16
+
17
+ Examples of behavior that contributes to a positive environment for our
18
+ community include:
19
+
20
+ - Demonstrating empathy and kindness toward other people
21
+ - Being respectful of differing opinions, viewpoints, and experiences
22
+ - Giving and gracefully accepting constructive feedback
23
+ - Accepting responsibility and apologizing to those affected by our mistakes,
24
+ and learning from the experience
25
+ - Focusing on what is best not just for us as individuals, but for the
26
+ overall community
27
+
28
+ Examples of unacceptable behavior include:
29
+
30
+ - The use of sexualized language or imagery, and sexual attention or
31
+ advances of any kind
32
+ - Trolling, insulting or derogatory comments, and personal or political attacks
33
+ - Public or private harassment
34
+ - Publishing others' private information, such as a physical or email
35
+ address, without their explicit permission
36
+ - Other conduct which could reasonably be considered inappropriate in a
37
+ professional setting
38
+
39
+ ## Enforcement Responsibilities
40
+
41
+ Community leaders are responsible for clarifying and enforcing our standards of
42
+ acceptable behavior and will take appropriate and fair corrective action in
43
+ response to any behavior that they deem inappropriate, threatening, offensive,
44
+ or harmful.
45
+
46
+ Community leaders have the right and responsibility to remove, edit, or reject
47
+ comments, commits, code, wiki edits, issues, and other contributions that are
48
+ not aligned to this Code of Conduct, and will communicate reasons for moderation
49
+ decisions when appropriate.
50
+
51
+ ## Scope
52
+
53
+ This Code of Conduct applies within all community spaces, and also applies when
54
+ an individual is officially representing the community in public spaces.
55
+ Examples of representing our community include using an official e-mail address,
56
+ posting via an official social media account, or acting as an appointed
57
+ representative at an online or offline event.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported to the community leaders responsible for enforcement at
63
+ .
64
+ All complaints will be reviewed and investigated promptly and fairly.
65
+
66
+ All community leaders are obligated to respect the privacy and security of the
67
+ reporter of any incident.
68
+
69
+ ## Enforcement Guidelines
70
+
71
+ Community leaders will follow these Community Impact Guidelines in determining
72
+ the consequences for any action they deem in violation of this Code of Conduct:
73
+
74
+ ### 1. Correction
75
+
76
+ **Community Impact**: Use of inappropriate language or other behavior deemed
77
+ unprofessional or unwelcome in the community.
78
+
79
+ **Consequence**: A private, written warning from community leaders, providing
80
+ clarity around the nature of the violation and an explanation of why the
81
+ behavior was inappropriate. A public apology may be requested.
82
+
83
+ ### 2. Warning
84
+
85
+ **Community Impact**: A violation through a single incident or series
86
+ of actions.
87
+
88
+ **Consequence**: A warning with consequences for continued behavior. No
89
+ interaction with the people involved, including unsolicited interaction with
90
+ those enforcing the Code of Conduct, for a specified period of time. This
91
+ includes avoiding interactions in community spaces as well as external channels
92
+ like social media. Violating these terms may lead to a temporary or
93
+ permanent ban.
94
+
95
+ ### 3. Temporary Ban
96
+
97
+ **Community Impact**: A serious violation of community standards, including
98
+ sustained inappropriate behavior.
99
+
100
+ **Consequence**: A temporary ban from any sort of interaction or public
101
+ communication with the community for a specified period of time. No public or
102
+ private interaction with the people involved, including unsolicited interaction
103
+ with those enforcing the Code of Conduct, is allowed during this period.
104
+ Violating these terms may lead to a permanent ban.
105
+
106
+ ### 4. Permanent Ban
107
+
108
+ **Community Impact**: Demonstrating a pattern of violation of community
109
+ standards, including sustained inappropriate behavior, harassment of an
110
+ individual, or aggression toward or disparagement of classes of individuals.
111
+
112
+ **Consequence**: A permanent ban from any sort of public interaction within
113
+ the community.
114
+
115
+ ## Attribution
116
+
117
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118
+ version 2.0, available at
119
+ https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120
+
121
+ Community Impact Guidelines were inspired by [Mozilla's code of conduct
122
+ enforcement ladder](https://github.com/mozilla/diversity).
123
+
124
+ [homepage]: https://www.contributor-covenant.org
125
+
126
+ For answers to common questions about this code of conduct, see the FAQ at
127
+ https://www.contributor-covenant.org/faq. Translations are available at
128
+ https://www.contributor-covenant.org/translations.
CONTRIBUTING.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing to Kotaemon
2
+
3
+ Welcome 👋 to the Kotaemon project! We're thrilled that you're interested in contributing. Whether you're fixing bugs, adding new features, or improving documentation, your efforts are highly appreciated. This guide aims to help you get started with contributing to Kotaemon.
4
+
5
+ <a href="https://github.com/Cinnamon/kotaemon/graphs/contributors">
6
+ <img src="https://contrib.rocks/image?repo=Cinnamon/kotaemon" />
7
+ </a>
8
+
9
+ ### Table of Contents
10
+
11
+ 1. [📖 Code of Conduct](#-code-of-conduct)
12
+ 2. [🔁 Contributing via Pull Requests](#-contributing-via-pull-requests)
13
+ 3. [📥 Opening an Issue](#-opening-an-issue)
14
+ 4. [📝 Commit Messages](#-commit-messages)
15
+ 5. [🧾 License](#-license)
16
+
17
+ ## 📖 Code of Conduct
18
+
19
+ Please review our [code of conduct](./CODE_OF_CONDUCT.md), which is in effect at all times. We expect everyone who contributes to this project to honor it.
20
+
21
+ ## 🔁 Contributing via Pull Requests
22
+
23
+ 1. [**Fork the repository**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo): Click on the [Fork](https://github.com/Cinnamon/kotaemon/fork) button on the repository's page to create a copy of Kotaemon under your GitHub account.
24
+
25
+ 2. [**Clone your code**](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository): Clone your forked repository to your local machine.
26
+
27
+ 3. [**Create new branch**](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop): Create a new branch in your forked repo with a descriptive name that reflects your changes.
28
+
29
+ ```sh
30
+ git checkout -b descriptive-name-for-your-changes
31
+ ```
32
+
33
+ 4. **Set up the development environment**: If you are working on the code, make sure to install the necessary dependencies for development.
34
+
35
+ ```sh
36
+ pip install -e "libs/kotaemon[dev]"
37
+ ```
38
+
39
+ 5. **Make your changes**: Ensure your code follows the project's coding style and passes all test cases.
40
+
41
+ - Check the coding style
42
+
43
+ ```sh
44
+ pre-commit run --all-files
45
+ ```
46
+
47
+ - Run the tests
48
+
49
+ ```sh
50
+ pytest libs/kotaemon/tests/
51
+ ```
52
+
53
+ 6. [**Commit your changes**](https://docs.github.com/en/desktop/making-changes-in-a-branch/committing-and-reviewing-changes-to-your-project-in-github-desktop): Once you are done with your changes, add and commit them with clear messages.
54
+
55
+ ```sh
56
+ git add your_changes.py
57
+ git commit -m "clear message describing your changes."
58
+ git push -u origin descriptive-name-for-your-changes
59
+ ```
60
+
61
+ 7. [**Create a pull request**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request): When you are satisfied with your changes, [submit a pull request](https://github.com/Cinnamon/kotaemon/compare) from your forked repository to the Kotaemon repository. In the pull request, provide a clear description of your changes and any related issues. For the title of the pull request, please refer to our [commit messages convention](#-commit-messages).
62
+
63
+ 8. **Wait for reviews**: Wait for the maintainers to review your pull request. If everything is okay, your changes will be merged into the Kotaemon project.
64
+
65
+ ### GitHub Actions CI Tests
66
+
67
+ All pull requests must pass the [GitHub Actions Continuous Integration (CI)](https://docs.github.com/en/actions/about-github-actions/about-continuous-integration-with-github-actions) tests before they can be merged. These tests include coding-style checks, PR title validation, unit tests, etc. to ensure that your changes meet the project's quality standards. Please review and fix any CI failures that arise.
68
+
69
+ ## 📥 Opening an Issue
70
+
71
+ Before [creating an issue](https://github.com/Cinnamon/kotaemon/issues/new/choose), search through existing issues to ensure you are not opening a duplicate. If you are reporting a bug or issue, please provide a reproducible example to help us quickly identify the problem.
72
+
73
+ ## 📝 Commit Messages
74
+
75
+ ### Overview
76
+
77
+ We use [Angular convention](https://www.conventionalcommits.org/en/) for commit messages to maintain consistency and clarity in our project history. Please take a moment to familiarize yourself with this convention before making your first commit.
78
+
79
+ _For the sake of simplicity, we use [squash merging](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) for pull requests. Therefore, if you contribute via a pull request, you only need to make sure your PR's title, rather than every individual commit message, follows this convention._
80
+
81
+ Commit format:
82
+
83
+ ```sh
84
+ <gitmoji> <type>(<scope>): <subject>
85
+ <BLANK LINE>
86
+ <body>
87
+ <BLANK LINE>
88
+ <footer>
89
+ ```
90
+
91
+ Examples:
92
+
93
+ ```sh
94
+ docs(api): update api doc
95
+ ```
96
+
97
+ ### Commit types
98
+
99
+ | Types | Description |
100
+ | :--------- | :------------------------------------------------------------ |
101
+ | `feat` | New features |
102
+ | `fix` | Bug fix |
103
+ | `docs` | Documentation only changes |
104
+ | `build` | Changes that affect the build system or external dependencies |
105
+ | `chore` | Something that doesn’t fit the other types |
106
+ | `ci` | Changes to our CI configuration files and scripts |
107
+ | `perf` | Improve performance |
108
+ | `refactor` | Refactor code |
109
+ | `revert` | Revert a previous commit |
110
+ | `style` | Improve structure/format of the code |
111
+ | `test` | Add, update or pass tests |
112
+
113
+ ## 🧾 License
114
+
115
+ All contributions will be licensed under the project's license: [Apache License 2.0](https://github.com/Cinnamon/kotaemon/blob/main/LICENSE.txt).
Dockerfile ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Lite version
2
+ FROM python:3.10-slim AS lite
3
+
4
+ # Common dependencies
5
+ RUN apt-get update -qqy && \
6
+ apt-get install -y --no-install-recommends \
7
+ ssh \
8
+ git \
9
+ gcc \
10
+ g++ \
11
+ poppler-utils \
12
+ libpoppler-dev \
13
+ unzip \
14
+ curl \
15
+ cargo
16
+
17
+ # Setup args
18
+ ARG TARGETPLATFORM
19
+ ARG TARGETARCH
20
+
21
+ # Set environment variables
22
+ ENV PYTHONDONTWRITEBYTECODE=1
23
+ ENV PYTHONUNBUFFERED=1
24
+ ENV PYTHONIOENCODING=UTF-8
25
+ ENV TARGETARCH=${TARGETARCH}
26
+
27
+ # Create working directory
28
+ WORKDIR /app
29
+
30
+ # Download pdfjs
31
+ COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh
32
+ RUN chmod +x /app/scripts/download_pdfjs.sh
33
+ ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist"
34
+ RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR
35
+
36
+ # Copy contents
37
+ COPY . /app
38
+ COPY .env.example /app/.env
39
+
40
+ # Install pip packages
41
+ RUN --mount=type=ssh \
42
+ --mount=type=cache,target=/root/.cache/pip \
43
+ pip install -e "libs/kotaemon" \
44
+ && pip install -e "libs/ktem" \
45
+ && pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
46
+
47
+ RUN --mount=type=ssh \
48
+ --mount=type=cache,target=/root/.cache/pip \
49
+ if [ "$TARGETARCH" = "amd64" ]; then pip install "graphrag<=0.3.6" future; fi
50
+
51
+ # Clean up
52
+ RUN apt-get autoremove \
53
+ && apt-get clean \
54
+ && rm -rf /var/lib/apt/lists/* \
55
+ && rm -rf ~/.cache
56
+
57
+ CMD ["python", "app.py"]
58
+
59
+ # Full version
60
+ FROM lite AS full
61
+
62
+ # Additional dependencies for full version
63
+ RUN apt-get update -qqy && \
64
+ apt-get install -y --no-install-recommends \
65
+ tesseract-ocr \
66
+ tesseract-ocr-jpn \
67
+ libsm6 \
68
+ libxext6 \
69
+ libreoffice \
70
+ ffmpeg \
71
+ libmagic-dev
72
+
73
+ # Install torch and torchvision for unstructured
74
+ RUN --mount=type=ssh \
75
+ --mount=type=cache,target=/root/.cache/pip \
76
+ pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
77
+
78
+ # Install additional pip packages
79
+ RUN --mount=type=ssh \
80
+ --mount=type=cache,target=/root/.cache/pip \
81
+ pip install -e "libs/kotaemon[adv]" \
82
+ && pip install unstructured[all-docs]
83
+
84
+ # Install lightRAG
85
+ ENV USE_LIGHTRAG=true
86
+ RUN --mount=type=ssh \
87
+ --mount=type=cache,target=/root/.cache/pip \
88
+ pip install aioboto3 nano-vectordb ollama xxhash lightrag-hku
89
+
90
+ # Clean up
91
+ RUN apt-get autoremove \
92
+ && apt-get clean \
93
+ && rm -rf /var/lib/apt/lists/* \
94
+ && rm -rf ~/.cache
95
+
96
+ # Download nltk packages as required for unstructured
97
+ RUN python -c "from unstructured.nlp.tokenize import _download_nltk_packages_if_not_present; _download_nltk_packages_if_not_present()"
98
+
99
+ CMD ["python", "app.py"]
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
README.md CHANGED
@@ -1,12 +1,369 @@
1
  ---
2
- title: Renesis
3
- emoji: 🏢
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.5.0
8
  app_file: app.py
9
- pinned: false
 
10
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: renesis
 
 
 
 
 
3
  app_file: app.py
4
+ sdk: gradio
5
+ sdk_version: 4.39.0
6
  ---
7
+ <div align="center">
8
+
9
+ # kotaemon
10
+
11
+ An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and
12
+ developers in mind.
13
+
14
+ ![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview-graph.png)
15
+
16
+ <a href="https://trendshift.io/repositories/11607" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11607" alt="Cinnamon%2Fkotaemon | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
17
+
18
+ [Live Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo) |
19
+ [Online Install](https://cinnamon.github.io/kotaemon/online_install/) |
20
+ [User Guide](https://cinnamon.github.io/kotaemon/) |
21
+ [Developer Guide](https://cinnamon.github.io/kotaemon/development/) |
22
+ [Feedback](https://github.com/Cinnamon/kotaemon/issues) |
23
+ [Contact](mailto:[email protected])
24
+
25
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-31013/)
26
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
27
+ <a href="https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon" target="_blank">
28
+ <img src="https://img.shields.io/badge/docker_pull-kotaemon:latest-brightgreen" alt="docker pull ghcr.io/cinnamon/kotaemon:latest"></a>
29
+ ![download](https://img.shields.io/github/downloads/Cinnamon/kotaemon/total.svg?label=downloads&color=blue)
30
+ <a href='https://huggingface.co/spaces/cin-model/kotaemon-demo'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
31
+ <a href="https://hellogithub.com/en/repository/d3141471a0244d5798bc654982b263eb" target="_blank"><img src="https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=d3141471a0244d5798bc654982b263eb&claim_uid=RLiD9UZ1rEHNaMf&theme=small" alt="Featured|HelloGitHub" /></a>
32
+
33
+ </div>
34
+
35
+ ## Introduction
36
+
37
+ This project serves as a functional RAG UI for both end users who want to do QA on their
38
+ documents and developers who want to build their own RAG pipeline.
39
+ <br>
40
+
41
+ ```yml
42
+ +----------------------------------------------------------------------------+
43
+ | End users: Those who use apps built with `kotaemon`. |
44
+ | (You use an app like the one in the demo above) |
45
+ | +----------------------------------------------------------------+ |
46
+ | | Developers: Those who built with `kotaemon`. | |
47
+ | | (You have `import kotaemon` somewhere in your project) | |
48
+ | | +----------------------------------------------------+ | |
49
+ | | | Contributors: Those who make `kotaemon` better. | | |
50
+ | | | (You make PR to this repo) | | |
51
+ | | +----------------------------------------------------+ | |
52
+ | +----------------------------------------------------------------+ |
53
+ +----------------------------------------------------------------------------+
54
+ ```
55
+
56
+ ### For end users
57
+
58
+ - **Clean & Minimalistic UI**: A user-friendly interface for RAG-based QA.
59
+ - **Support for Various LLMs**: Compatible with LLM API providers (OpenAI, AzureOpenAI, Cohere, etc.) and local LLMs (via `ollama` and `llama-cpp-python`).
60
+ - **Easy Installation**: Simple scripts to get you started quickly.
61
+
62
+ ### For developers
63
+
64
+ - **Framework for RAG Pipelines**: Tools to build your own RAG-based document QA pipeline.
65
+ - **Customizable UI**: See your RAG pipeline in action with the provided UI, built with <a href='https://github.com/gradio-app/gradio'>Gradio <img src='https://img.shields.io/github/stars/gradio-app/gradio'></a>.
66
+ - **Gradio Theme**: If you use Gradio for development, check out our theme here: [kotaemon-gradio-theme](https://github.com/lone17/kotaemon-gradio-theme).
67
+
68
+ ## Key Features
69
+
70
+ - **Host your own document QA (RAG) web-UI**: Support multi-user login, organize your files in private/public collections, collaborate and share your favorite chat with others.
71
+
72
+ - **Organize your LLM & Embedding models**: Support both local LLMs & popular API providers (OpenAI, Azure, Ollama, Groq).
73
+
74
+ - **Hybrid RAG pipeline**: Sane default RAG pipeline with hybrid (full-text & vector) retriever and re-ranking to ensure best retrieval quality.
75
+
76
+ - **Multi-modal QA support**: Perform Question Answering on multiple documents with figures and tables support. Support multi-modal document parsing (selectable options on UI).
77
+
78
+ - **Advanced citations with document preview**: By default the system will provide detailed citations to ensure the correctness of LLM answers. View your citations (incl. relevance score) directly in the _in-browser PDF viewer_ with highlights. Warns when the retrieval pipeline returns articles with low relevance.
79
+
80
+ - **Support complex reasoning methods**: Use question decomposition to answer your complex/multi-hop question. Support agent-based reasoning with `ReAct`, `ReWOO` and other agents.
81
+
82
+ - **Configurable settings UI**: You can adjust most important aspects of retrieval & generation process on the UI (incl. prompts).
83
+
84
+ - **Extensible**: Being built on Gradio, you are free to customize or add any UI elements as you like. Also, we aim to support multiple strategies for document indexing & retrieval. `GraphRAG` indexing pipeline is provided as an example.
85
+
86
+ ![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview.png)
87
+
88
+ ## Installation
89
+
90
+ > If you are not a developer and just want to use the app, please check out our easy-to-follow [User Guide](https://cinnamon.github.io/kotaemon/). Download the `.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest) to get all the newest features and bug fixes.
91
+
92
+ ### System requirements
93
+
94
+ 1. [Python](https://www.python.org/downloads/) >= 3.10
95
+ 2. [Docker](https://www.docker.com/): optional, if you [install with Docker](#with-docker-recommended)
96
+ 3. [Unstructured](https://docs.unstructured.io/open-source/installation/full-installation#full-installation) if you want to process files other than `.pdf`, `.html`, `.mhtml`, and `.xlsx` documents. Installation steps differ depending on your operating system. Please visit the link and follow the specific instructions provided there.
97
+
98
+ ### With Docker (recommended)
99
+
100
+ 1. We support both `lite` & `full` versions of the Docker images. With `full`, the extra packages of `unstructured` are installed as well, so it can support additional file types (`.doc`, `.docx`, ...), but at the cost of a larger Docker image size. For most users, the `lite` image should work well.
101
+
102
+ - To use the `lite` version.
103
+
104
+ ```bash
105
+ docker run \
106
+ -e GRADIO_SERVER_NAME=0.0.0.0 \
107
+ -e GRADIO_SERVER_PORT=7860 \
108
+ -p 7860:7860 -it --rm \
109
+ ghcr.io/cinnamon/kotaemon:main-lite
110
+ ```
111
+
112
+ - To use the `full` version.
113
+
114
+ ```bash
115
+ docker run \
116
+ -e GRADIO_SERVER_NAME=0.0.0.0 \
117
+ -e GRADIO_SERVER_PORT=7860 \
118
+ -p 7860:7860 -it --rm \
119
+ ghcr.io/cinnamon/kotaemon:main-full
120
+ ```
121
+
122
+ 2. We currently support and test two platforms: `linux/amd64` and `linux/arm64` (for newer Mac). You can specify the platform by passing `--platform` in the `docker run` command. For example:
123
+
124
+ ```bash
125
+ # To run docker with platform linux/arm64
126
+ docker run \
127
+ -e GRADIO_SERVER_NAME=0.0.0.0 \
128
+ -e GRADIO_SERVER_PORT=7860 \
129
+ -p 7860:7860 -it --rm \
130
+ --platform linux/arm64 \
131
+ ghcr.io/cinnamon/kotaemon:main-lite
132
+ ```
133
+
134
+ 3. Once everything is set up correctly, you can go to `http://localhost:7860/` to access the WebUI.
135
+
136
+ 4. We use [GHCR](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) to store docker images, all images can be found [here.](https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon)
137
+
138
+ ### Without Docker
139
+
140
+ 1. Clone and install required packages on a fresh python environment.
141
+
142
+ ```shell
143
+ # optional (setup env)
144
+ conda create -n kotaemon python=3.10
145
+ conda activate kotaemon
146
+
147
+ # clone this repo
148
+ git clone https://github.com/Cinnamon/kotaemon
149
+ cd kotaemon
150
+
151
+ pip install -e "libs/kotaemon[all]"
152
+ pip install -e "libs/ktem"
153
+ ```
154
+
155
+ 2. Create a `.env` file in the root of this project. Use `.env.example` as a template
156
+
157
+ The `.env` file is there to serve use cases where users want to pre-config the models before starting up the app (e.g. deploy the app on HF hub). The file will only be used to populate the db once upon the first run; it will no longer be used in subsequent runs.
158
+
159
+ 3. (Optional) To enable in-browser `PDF_JS` viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) then extract it to `libs/ktem/ktem/assets/prebuilt`
160
+
161
+ <img src="https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/pdf-viewer-setup.png" alt="pdf-setup" width="300">
162
+
163
+ 4. Start the web server:
164
+
165
+ ```shell
166
+ python app.py
167
+ ```
168
+
169
+ - The app will be automatically launched in your browser.
170
+ - Default username and password are both `admin`. You can set up additional users directly through the UI.
171
+
172
+ ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png)
173
+
174
+ 5. Check the `Resources` tab and `LLMs and Embeddings` and ensure that your `api_key` value is set correctly from your `.env` file. If it is not set, you can set it there.
175
+
176
+ ### Setup GraphRAG
177
+
178
+ > [!NOTE]
179
+ > Official MS GraphRAG indexing only works with OpenAI or Ollama API.
180
+ > We recommend most users to use NanoGraphRAG implementation for straightforward integration with Kotaemon.
181
+
182
+ <details>
183
+
184
+ <summary>Setup Nano GRAPHRAG</summary>
185
+
186
+ - Install nano-GraphRAG: `pip install nano-graphrag`
187
+ - `nano-graphrag` install might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440)
188
+ - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib`
189
+ - Launch Kotaemon with `USE_NANO_GRAPHRAG=true` environment variable.
190
+ - Set your default LLM & Embedding models in Resources setting and it will be recognized automatically from NanoGraphRAG.
191
+
192
+ </details>
193
+
194
+ <details>
195
+
196
+ <summary>Setup LIGHTRAG</summary>
197
+
198
+ - Install LightRAG: `pip install git+https://github.com/HKUDS/LightRAG.git`
199
+ - `LightRAG` install might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440)
200
+ - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib`
201
+ - Launch Kotaemon with `USE_LIGHTRAG=true` environment variable.
202
+ - Set your default LLM & Embedding models in Resources setting and it will be recognized automatically from LightRAG.
203
+
204
+ </details>
205
+
206
+ <details>
207
+
208
+ <summary>Setup MS GRAPHRAG</summary>
209
+
210
+ - **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:
211
+
212
+ ```shell
213
+ pip install "graphrag<=0.3.6" future
214
+ ```
215
+
216
+ - **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.
217
+ - **Using Local Models and Custom Settings**: If you want to use GraphRAG with local models (like `Ollama`) or customize the default LLM and other configurations, set the `USE_CUSTOMIZED_GRAPHRAG_SETTING` environment variable to true. Then, adjust your settings in the `settings.yaml.example` file.
218
+
219
+ </details>
220
+
221
+ ### Setup Local Models (for local/private RAG)
222
+
223
+ See [Local model setup](docs/local_model.md).
224
+
225
+ ### Customize your application
226
+
227
+ - By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
228
+
229
+ - For advanced users or specific use cases, you can customize these files:
230
+
231
+ - `flowsettings.py`
232
+ - `.env`
233
+
234
+ #### `flowsettings.py`
235
+
236
+ This file contains the configuration of your application. You can use the example
237
+ [here](flowsettings.py) as the starting point.
238
+
239
+ <details>
240
+
241
+ <summary>Notable settings</summary>
242
+
243
+ ```python
244
+ # setup your preferred document store (with full-text search capabilities)
245
+ KH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)
246
+
247
+ # setup your preferred vectorstore (for vector-based search)
248
+ KH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)
249
+
250
+ # Enable / disable multimodal QA
251
+ KH_REASONINGS_USE_MULTIMODAL=True
252
+
253
+ # Setup your new reasoning pipeline or modify existing one.
254
+ KH_REASONINGS = [
255
+ "ktem.reasoning.simple.FullQAPipeline",
256
+ "ktem.reasoning.simple.FullDecomposeQAPipeline",
257
+ "ktem.reasoning.react.ReactAgentPipeline",
258
+ "ktem.reasoning.rewoo.RewooAgentPipeline",
259
+ ]
260
+ ```
261
+
262
+ </details>
263
+
264
+ #### `.env`
265
+
266
+ This file provides another way to configure your models and credentials.
267
+
268
+ <details>
269
+
270
+ <summary>Configure model via the .env file</summary>
271
+
272
+ - Alternatively, you can configure the models via the `.env` file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
273
+
274
+ - Currently, the following providers are supported:
275
+
276
+ - **OpenAI**
277
+
278
+ In the `.env` file, set the `OPENAI_API_KEY` variable with your OpenAI API key in order
279
+ to enable access to OpenAI's models. There are other variables that can be modified,
280
+ please feel free to edit them to fit your case. Otherwise, the default parameter should
281
+ work for most people.
282
+
283
+ ```shell
284
+ OPENAI_API_BASE=https://api.openai.com/v1
285
+ OPENAI_API_KEY=<your OpenAI API key here>
286
+ OPENAI_CHAT_MODEL=gpt-3.5-turbo
287
+ OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
288
+ ```
289
+
290
+ - **Azure OpenAI**
291
+
292
+ For OpenAI models via Azure platform, you need to provide your Azure endpoint and API
293
+ key. You might also need to provide your deployments' names for the chat model and the
294
+ embedding model depending on how you set up your Azure deployment.
295
+
296
+ ```shell
297
+ AZURE_OPENAI_ENDPOINT=
298
+ AZURE_OPENAI_API_KEY=
299
+ OPENAI_API_VERSION=2024-02-15-preview
300
+ AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
301
+ AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
302
+ ```
303
+
304
+ - **Local Models**
305
+
306
+ - Using `ollama` OpenAI compatible server:
307
+
308
+ - Install [ollama](https://github.com/ollama/ollama) and start the application.
309
+
310
+ - Pull your model, for example:
311
+
312
+ ```shell
313
+ ollama pull llama3.1:8b
314
+ ollama pull nomic-embed-text
315
+ ```
316
+
317
+ - Set the model names on the web UI and make them the default:
318
+
319
+ ![Models](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/models.png)
320
+
321
+ - Using `GGUF` with `llama-cpp-python`
322
+
323
+ You can search and download an LLM to be run locally from the [Hugging Face Hub](https://huggingface.co/models). Currently, these model formats are supported:
324
+
325
+ - GGUF
326
+
327
+ You should choose a model whose size is less than your device's memory and should leave
328
+ about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available,
329
+ then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to
330
+ give better generation but also take more processing time.
331
+
332
+ Here are some recommendations and their size in memory:
333
+
334
+ - [Qwen1.5-1.8B-Chat-GGUF](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true): around 2 GB
335
+
336
+ Add a new LlamaCpp model with the provided model name on the web UI.
337
+
338
+ </details>
339
+
340
+ ### Adding your own RAG pipeline
341
+
342
+ #### Custom Reasoning Pipeline
343
+
344
+ 1. Check the default pipeline implementation [here](libs/ktem/ktem/reasoning/simple.py). You can make quick adjustments to how the default QA pipeline works.
345
+ 2. Add a new `.py` implementation in `libs/ktem/ktem/reasoning/` and later include it in `flowsettings` to enable it on the UI.
346
+
347
+ #### Custom Indexing Pipeline
348
+
349
+ - Check sample implementation in `libs/ktem/ktem/index/file/graph`
350
+
351
+ > (more instruction WIP).
352
+
353
+ ## Star History
354
+
355
+ <a href="https://star-history.com/#Cinnamon/kotaemon&Date">
356
+ <picture>
357
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date&theme=dark" />
358
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date" />
359
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date" />
360
+ </picture>
361
+ </a>
362
+
363
+ ## Contribution
364
+
365
+ Since our project is actively being developed, we greatly value your feedback and contributions. Please see our [Contributing Guide](https://github.com/Cinnamon/kotaemon/blob/main/CONTRIBUTING.md) to get started. Thank you to all our contributors!
366
 
367
+ <a href="https://github.com/Cinnamon/kotaemon/graphs/contributors">
368
+ <img src="https://contrib.rocks/image?repo=Cinnamon/kotaemon" />
369
+ </a>
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from theflow.settings import settings as flowsettings
4
+
5
+ KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".")
6
+ GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None)
7
+ # override GRADIO_TEMP_DIR if it's not set
8
+ if GRADIO_TEMP_DIR is None:
9
+ GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp")
10
+ os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR
11
+
12
+
13
+ from ktem.main import App # noqa
14
+
15
+ app = App()
16
+ demo = app.make()
17
+ demo.queue().launch(
18
+ share=True,
19
+ favicon_path=app._favicon,
20
+ inbrowser=True,
21
+ allowed_paths=[
22
+ "libs/ktem/ktem/assets",
23
+ GRADIO_TEMP_DIR,
24
+ ],
25
+ )
doc_env_reqs.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ mkdocs
2
+ mkdocstrings[python]
3
+ mkdocs-material
4
+ mkdocs-gen-files
5
+ mkdocs-literate-nav
6
+ mkdocs-video
7
+ mkdocs-git-revision-date-localized-plugin
8
+ mkdocs-section-index
9
+ mdx_truly_sane_lists
docs/about.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # About Kotaemon
2
+
3
+ An open-source tool for chatting with your documents. Built with both end users and
4
+ developers in mind.
5
+
6
+ [Source Code](https://github.com/Cinnamon/kotaemon) |
7
+ [Live Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo)
8
+
9
+ [User Guide](https://cinnamon.github.io/kotaemon/) |
10
+ [Developer Guide](https://cinnamon.github.io/kotaemon/development/) |
11
+ [Feedback](https://github.com/Cinnamon/kotaemon/issues)
12
+
13
+ [Dark Mode](?__theme=dark) |
14
+ [Light Mode](?__theme=light)
docs/development/contributing.md ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Contributing
2
+
3
+ ## Setting up
4
+
5
+ - Clone the repo
6
+
7
+ ```shell
8
+ git clone [email protected]:Cinnamon/kotaemon.git
9
+ cd kotaemon
10
+ ```
11
+
12
+ - Install the environment
13
+
14
+ - Create a conda environment (python >= 3.10 is recommended)
15
+
16
+ ```shell
17
+ conda create -n kotaemon python=3.10
18
+ conda activate kotaemon
19
+
20
+ # install dependencies
21
+ cd libs/kotaemon
22
+ pip install -e ".[all]"
23
+ ```
24
+
25
+ - Or run the installer (one of the `scripts/run_*` scripts depends on your OS), then
26
+ you will have all the dependencies installed as a conda environment at
27
+ `install_dir/env`.
28
+
29
+ ```shell
30
+ conda activate install_dir/env
31
+ ```
32
+
33
+ - Pre-commit
34
+
35
+ ```shell
36
+ pre-commit install
37
+ ```
38
+
39
+ - Test
40
+
41
+ ```shell
42
+ pytest tests
43
+ ```
44
+
45
+ ## Package overview
46
+
47
+ `kotaemon` library focuses on the AI building blocks to implement a RAG-based QA application. It consists of base interfaces, core components and a list of utilities:
48
+
49
+ - Base interfaces: `kotaemon` defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly defining this interface, a pipeline of steps can be easily constructed and orchestrated.
50
+ - Core components: `kotaemon` implements (or wraps 3rd-party libraries
51
+ like Langchain, llama-index,... when possible) commonly used components in
52
+ kotaemon use cases. Some of these components are: LLM, vector store,
53
+ document store, retriever... For a detailed list and description of these
54
+ components, please refer to the [API Reference](../reference/Summary.md) section.
55
+ - List of utilities: `kotaemon` provides utilities and tools that are
56
+ usually needed in client project. For example, it provides a prompt
57
+ engineering UI for AI developers in a project to quickly create a prompt
58
+ engineering tool for DMs and QALs. It also provides a command to quickly spin
59
+ up a project code base. For a full list and description of these utilities,
60
+ please refer to the [Utilities](utilities.md) section.
61
+
62
+ ```mermaid
63
+ mindmap
64
+ root((kotaemon))
65
+ Base Interfaces
66
+ Document
67
+ LLMInterface
68
+ RetrievedDocument
69
+ BaseEmbeddings
70
+ BaseChat
71
+ BaseCompletion
72
+ ...
73
+ Core Components
74
+ LLMs
75
+ AzureOpenAI
76
+ OpenAI
77
+ Embeddings
78
+ AzureOpenAI
79
+ OpenAI
80
+ HuggingFaceEmbedding
81
+ VectorStore
82
+ InMemoryVectorstore
83
+ ChromaVectorstore
84
+ Agent
85
+ Tool
86
+ DocumentStore
87
+ ...
88
+ Utilities
89
+ Scaffold project
90
+ PromptUI
91
+ Documentation Support
92
+ ```
93
+
94
+ ## Common conventions
95
+
96
+ - PR title: One-line description (example: Feat: Declare BaseComponent and decide LLM call interface).
97
+ - [Encouraged] Provide a quick description in the PR, so that:
98
+ - Reviewers can quickly understand the direction of the PR.
99
+ - It will be included in the commit message when the PR is merged.
100
+
101
+ ## Environment caching on PR
102
+
103
+ - To speed up CI, environments are cached based on the version specified in `__init__.py`.
104
+ - Since dependency versions in `setup.py` are not pinned, you need to bump the version in order to use a new environment. That environment will then be cached and used by your subsequent commits within the PR, until you bump the version again.
105
+ - The new environment created during your PR is cached and will be available to others once the PR is merged.
106
+ - If you are experimenting with new dependencies and want a fresh environment every time, add `[ignore cache]` in your commit message. The CI will create a fresh environment to run your commit and then discard it.
107
+ - If your PR includes updated dependencies, the recommended workflow would be:
108
+ - Doing development as usual.
109
+ - When you want to run the CI, push a commit with the message containing `[ignore cache]`.
110
+ - Once the PR is final, bump the version in `__init__.py` and push a final commit not containing `[ignore cache]`.
111
+
112
+ ## Merge PR guideline
113
+
114
+ - Use squash and merge option
115
+ - 1st line message is the PR title.
116
+ - The text area is the PR description.
docs/development/create-a-component.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Creating a component
2
+
3
+ A fundamental concept in kotaemon is "component".
4
+
5
+ Anything that isn't data or data structure is a "component". A component can be
6
+ thought of as a step within a pipeline. It takes in some input, processes it,
7
+ and returns an output, just the same as a Python function! The output will then
8
+ become an input for the next component in a pipeline. In fact, a pipeline is just
9
+ a component. More appropriately, a nested component: a component that makes use of one or more other components in
10
+ the processing step. So in reality, there isn't a difference between a pipeline
11
+ and a component! Because of that, in kotaemon, we will consider them the
12
+ same as "component".
13
+
14
+ To define a component, you will:
15
+
16
+ 1. Create a class that subclasses from `kotaemon.base.BaseComponent`
17
+ 2. Declare init params with type annotation
18
+ 3. Declare nodes (nodes are just other components!) with type annotation
19
+ 4. Implement the processing logic in `run`.
20
+
21
+ The syntax of a component is as follows:
22
+
23
+ ```python
24
+ from kotaemon.base import BaseComponent
25
+ from kotaemon.llms import LCAzureChatOpenAI
26
+ from kotaemon.parsers import RegexExtractor
27
+
28
+
29
+ class FancyPipeline(BaseComponent):
30
+ param1: str = "This is param1"
31
+ param2: int = 10
32
+ param3: float
33
+
34
+ node1: BaseComponent # this is a node because of BaseComponent type annotation
35
+ node2: LCAzureChatOpenAI # this is also a node because LCAzureChatOpenAI subclasses BaseComponent
36
+ node3: RegexExtractor # this is also a node because RegexExtractor subclasses BaseComponent
37
+
38
+ def run(self, some_text: str):
39
+ prompt = (self.param1 + some_text) * int(self.param2 + self.param3)
40
+ llm_pred = self.node2(prompt).text
41
+ matches = self.node3(llm_pred)
42
+ return matches
43
+ ```
44
+
45
+ Then this component can be used as follows:
46
+
47
+ ```python
48
+ llm = LCAzureChatOpenAI(endpoint="some-endpoint")
49
+ extractor = RegexExtractor(pattern=["yes", "Yes"])
50
+
51
+ component = FancyPipeline(
52
+ param1="Hello",
53
+ param3=1.5,
54
+ node1=llm,
55
+ node2=llm,
56
+ node3=extractor
57
+ )
58
+ component("goodbye")
59
+ ```
60
+
61
+ This way, we can define each operation as a reusable component, and use them to
62
+ compose larger reusable components!
63
+
64
+ ## Benefits of component
65
+
66
+ By defining a component as above, we formally encapsulate all the necessary
67
+ information inside a single class. This introduces several benefits:
68
+
69
+ 1. Allow tools like promptui to inspect the inner working of a component in
70
+ order to automatically generate the promptui.
71
+ 2. Allow visualizing a pipeline for debugging purpose.
docs/development/data-components.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data & Data Structure Components
2
+
3
+ The data & data structure components include:
4
+
5
+ - The `Document` class.
6
+ - The document store.
7
+ - The vector store.
8
+
9
+ ## Data Loader
10
+
11
+ - PdfLoader
12
+ - Layout-aware with table parsing PdfLoader
13
+
14
+ - MathPixLoader: To use this loader, you need MathPix API key, refer to [mathpix docs](https://docs.mathpix.com/#introduction) for more information
15
+ - OCRLoader: This loader uses lib-table and Flax pipeline to perform OCR and read table structure from PDF file (TODO: add more info about deployment of this module).
16
+ - Output:
17
+
18
+ - Document: text + metadata to identify whether it is table or not
19
+
20
+ ```
21
+ - "source": source file name
22
+ - "type": "table" or "text"
23
+ - "table_origin": original table in markdown format (to be feed to LLM or visualize using external tools)
24
+ - "page_label": page number in the original PDF document
25
+ ```
26
+
27
+ ## Document Store
28
+
29
+ - InMemoryDocumentStore
30
+
31
+ ## Vector Store
32
+
33
+ - ChromaVectorStore
34
+ - InMemoryVectorStore
docs/development/index.md ADDED
@@ -0,0 +1 @@
 
 
1
+ --8<-- "README.md"
docs/development/utilities.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utilities
2
+
3
+ ## Prompt engineering UI
4
+
5
+ ![chat-ui](images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png)
6
+
7
+ **_Important:_** despite the name prompt engineering UI, this tool allows testers to test any kind of parameters that are exposed by developers. Prompt is one kind of param. There can be other types of params that testers can tweak (e.g. top_k, temperature...).
8
+
9
+ In the development process, developers typically build the pipeline. However, for use
10
+ cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more
11
+ effective. To facilitate this, `kotaemon` offers a user-friendly prompt engineering UI
12
+ that developers integrate into their pipelines. This enables non-technical members to
13
+ adjust prompts and parameters, run experiments, and export results for optimization.
14
+
15
+ As of Sept 2023, there are 2 kinds of prompt engineering UI:
16
+
17
+ - Simple pipeline: run one-way from start to finish.
18
+ - Chat pipeline: interactive back-and-forth.
19
+
20
+ ### Simple pipeline
21
+
22
+ For a simple pipeline, the supported client project workflow looks as follows:
23
+
24
+ 1. [tech] Build pipeline
25
+ 2. [tech] Export pipeline to config: `$ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>`
26
+ 3. [tech] Customize the config
27
+ 4. [tech] Spin up prompt engineering UI: `$ kotaemon promptui run <path/to/config/file.yml>`
28
+ 5. [non-tech] Change params, run inference
29
+ 6. [non-tech] Export to Excel
30
+ 7. [non-tech] Select the set of params that achieve the best output
31
+
32
+ The prompt engineering UI is mainly involved in steps 2 through 7 (step 1 is normally
33
+ done by the developers, while step 7 happens exclusively in the Excel file).
34
+
35
+ #### Step 2 - Export pipeline to config
36
+
37
+ Command:
38
+
39
+ ```shell
40
+ $ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>
41
+ ```
42
+
43
+ where:
44
+
45
+ - `<module.path.pipelineclass>` is a dot-separated path to the pipeline. For example, if your pipeline can be accessed with `from projectA.pipelines import AnsweringPipeline`, then this value is `projectA.pipelines.AnsweringPipeline`.
46
+ - `<path/to/config/file.yml>` is the target file path that the config will be exported to. If the config file already exists, and contains information of other pipelines, the config of current pipeline will additionally be added. If it contains information of the current pipeline (in the past), the old information will be replaced.
47
+
48
+ By default, all params in a pipeline (including nested params) will be exported to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those params with `ignore_ui=True`, and they will be ignored in the config generation process. Example:
49
+
50
+ ```python
51
+ class Pipeline(BaseComponent):
52
+ param1: str = Param(default="hello")
53
+ param2: str = Param(default="goodbye", ignore_ui=True)
54
+ ```
55
+
56
+ Declared as above, `param1` will show up in the config YAML file, while `param2` will not.
57
+
58
+ #### Step 3 - Customize the config
59
+
60
+ Developers can further edit the config file in this step to get the most suitable UI (step 4) for their tasks. The exported config will have this overall schema:
61
+
62
+ ```yml
63
+ <module.path.pipelineclass1>:
64
+ params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.)
65
+ inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.)
66
+ outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.)
67
+ logs: ... (Detail what information should show up in the log.)
68
+ ```
69
+
70
+ ##### Input and params
71
+
72
+ The inputs section has the overall schema as follows:
73
+
74
+ ```yml
75
+ inputs:
76
+ <input-variable-name-1>:
77
+ component: <supported-UI-component>
78
+ params: # (this section is optional)
79
+ value: <default-value>
80
+ <input-variable-name-2>: ... # similar to above
81
+ params:
82
+ <param-variable-name-1>: ... # similar to those in the inputs
83
+ ```
84
+
85
+ The list of supported prompt UI and their corresponding gradio UI components:
86
+
87
+ ```python
88
+ COMPONENTS_CLASS = {
89
+ "text": gr.components.Textbox,
90
+ "checkbox": gr.components.CheckboxGroup,
91
+ "dropdown": gr.components.Dropdown,
92
+ "file": gr.components.File,
93
+ "image": gr.components.Image,
94
+ "number": gr.components.Number,
95
+ "radio": gr.components.Radio,
96
+ "slider": gr.components.Slider,
97
+ }
98
+ ```
99
+
100
+ ##### Outputs
101
+
102
+ The outputs are a list of variables that we wish to show in the UI. Since a function's output in Python doesn't have a variable name, the output declaration is a little different from the input and param declarations:
103
+
104
+ ```yml
105
+ outputs:
106
+ - component: <supported-UI-component>
107
+ step: <name-of-pipeline-step>
108
+ item: <jsonpath way to retrieve the info>
109
+ - ... # similar to above
110
+ ```
111
+
112
+ where:
113
+
114
+ - component: the same text string and corresponding Gradio UI as in inputs & params
115
+ - step: the pipeline step whose output we wish to fetch and show on the UI
116
+ - item: the jsonpath mechanism to get the targeted variable from the step above
117
+
118
+ ##### Logs
119
+
120
+ The logs section lists the sheet names and, for each one, how to retrieve the desired information.
121
+
122
+ ```yml
123
+ logs:
124
+ <logname>:
125
+ inputs:
126
+ - name: <column name>
127
+ step: <the pipeline step that we would wish to see the input>
128
+ variable: <the variable in the step>
129
+ - ...
130
+ outputs:
131
+ - name: <column name>
132
+ step: <the pipeline step that we would wish to see the output>
133
+ item: <how to retrieve the output of that step>
134
+ ```
135
+
136
+ #### Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering
137
+
138
+ Command:
139
+
140
+ ```shell
141
+ $ kotaemon promptui run <path/to/config/file.yml>
142
+ ```
143
+
144
+ This will generate a UI as follows:
145
+
146
+ ![Screenshot from 2023-09-20 12-20-31](images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png)
147
+
148
+ where:
149
+
150
+ - The tabs at the top of the UI correspond to the pipelines available for prompt engineering.
151
+ - The inputs and params tabs allow users to edit values (these correspond to the inputs and params in the config file).
152
+ - The outputs panel holds the UI elements to show the outputs defined in config file.
153
+ - The Run button: will execute pipeline with the supplied inputs and params, and render result in the outputs panel.
154
+ - The Export button: will export the logs of all the runs to an Excel file for users to inspect for the best set of params.
155
+
156
+ #### Step 6 - Export to Excel
157
+
158
+ Upon clicking export, users can download the Excel file.
159
+
160
+ ### Chat pipeline
161
+
162
+ The chat pipeline workflow is different from the simple pipeline workflow. In a simple pipeline, each Run creates a set of output, input and params for users to compare. In a chat pipeline, each Run is not a one-off run, but a long interactive session. Hence, the workflow is as follows:
163
+
164
+ 1. Set the desired parameters.
165
+ 2. Click "New chat" to start a chat session with the supplied parameters. This set of parameters will persist until the end of the chat session. During an ongoing chat session, changing the parameters will not take any effect.
166
+ 3. Chat and interact with the chat bot on the right panel. You can add any additional input (if any), and they will be supplied to the chatbot.
167
+ 4. During chat, the log of the chat will show up in the "Output" tabs. This is empty by default, so if you want to show the log here, tell the AI developers to configure the UI settings.
168
+ 5. When finishing chat, select your preference in the radio box. Click "End chat". This will save the chat log and the preference to disk.
169
+ 6. To compare the results of different runs, click "Export" to get an Excel spreadsheet summary of the different runs.
docs/extra/css/code_select.css ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ .language-pycon .gp,
2
+ .language-pycon .go {
3
+ /* Generic.Prompt, Generic.Output */
4
+ user-select: none;
5
+ }
docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png ADDED
docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png ADDED
docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png ADDED
docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png ADDED
docs/images/change_space_params.png ADDED
docs/images/chat-demo.gif ADDED

Git LFS Details

  • SHA256: 942c79c0aac406bae4b5a4b2a4ff390cdbbffd1d1207e01aac58b3da025cb73b
  • Pointer size: 132 Bytes
  • Size of remote file: 7.11 MB
docs/images/chat-tab-demo.png ADDED
docs/images/chat-tab.png ADDED
docs/images/close_logs_space.png ADDED
docs/images/cohere_api_key.png ADDED
docs/images/duplicate_space.png ADDED
docs/images/file-index-tab.png ADDED
docs/images/index-embedding.png ADDED
docs/images/info-panel-scores.png ADDED
docs/images/initial_startup.png ADDED
docs/images/llm-default.png ADDED
docs/images/models.png ADDED
docs/images/pdf-viewer-setup.png ADDED
docs/images/preview-graph.png ADDED
docs/images/preview.png ADDED