Spaces:

noumanjavaid
/

renesis

Paused

App Files Files Community

noumanjavaid committed on Nov 15, 2024

Commit

ad33df7

verified ·

1 Parent(s): 0e07a73

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.commitlintrc +37 -0
.dockerignore +13 -0
.env.example +44 -0
.gitattributes +4 -35
.github/ISSUE_TEMPLATE/bug_report.yml +85 -0
.github/ISSUE_TEMPLATE/config.yml +1 -0
.github/ISSUE_TEMPLATE/feature_request.yml +49 -0
.github/PULL_REQUEST_TEMPLATE.md +18 -0
.github/workflows/auto-bump-and-release.yaml +62 -0
.github/workflows/build-push-docker.yaml +103 -0
.github/workflows/pr-lint.yaml +77 -0
.github/workflows/style-check.yaml +20 -0
.github/workflows/unit-test.yaml +109 -0
.gitignore +479 -0
.pre-commit-config.yaml +69 -0
.python-version +1 -0
CODE_OF_CONDUCT.md +128 -0
CONTRIBUTING.md +115 -0
Dockerfile +99 -0
LICENSE.txt +201 -0
README.md +365 -8
app.py +25 -0
doc_env_reqs.txt +9 -0
docs/about.md +14 -0
docs/development/contributing.md +116 -0
docs/development/create-a-component.md +71 -0
docs/development/data-components.md +34 -0
docs/development/index.md +1 -0
docs/development/utilities.md +169 -0
docs/extra/css/code_select.css +5 -0
docs/images/269170170-af94ff6b-b8b4-4602-ab6e-2947deb30dff.png +0 -0
docs/images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png +0 -0
docs/images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png +0 -0
docs/images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png +0 -0
docs/images/change_space_params.png +0 -0
docs/images/chat-demo.gif +3 -0
docs/images/chat-tab-demo.png +0 -0
docs/images/chat-tab.png +0 -0
docs/images/close_logs_space.png +0 -0
docs/images/cohere_api_key.png +0 -0
docs/images/duplicate_space.png +0 -0
docs/images/file-index-tab.png +0 -0
docs/images/index-embedding.png +0 -0
docs/images/info-panel-scores.png +0 -0
docs/images/initial_startup.png +0 -0
docs/images/llm-default.png +0 -0
docs/images/models.png +0 -0
docs/images/pdf-viewer-setup.png +0 -0
docs/images/preview-graph.png +0 -0
docs/images/preview.png +0 -0

.commitlintrc ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "extends": ["@commitlint/config-conventional"],
+  "defaultIgnores": true,
+  "rules": {
+    "body-leading-blank": [1, "always"],
+    "body-max-line-length": [2, "always", 100],
+    "footer-leading-blank": [1, "always"],
+    "footer-max-line-length": [2, "always", 10000],
+    "header-max-length": [2, "always", 200],
+    "subject-case": [
+      2,
+      "never",
+      []
+    ],
+    "subject-empty": [2, "never"],
+    "subject-full-stop": [2, "never", "."],
+    "type-case": [2, "always", "lower-case"],
+    "type-empty": [2, "never"],
+    "type-enum": [
+      2,
+      "always",
+      [
+        "build",
+        "chore",
+        "ci",
+        "docs",
+        "feat",
+        "fix",
+        "perf",
+        "refactor",
+        "revert",
+        "style",
+        "test"
+      ]
+    ]
+  }
+}

.dockerignore ADDED Viewed

	@@ -0,0 +1,13 @@

+.github/
+.git/
+.mypy_cache/
+__pycache__/
+ktem_app_data/
+env/
+.pre-commit-config.yaml
+.commitlintrc
+.gitignore
+.gitattributes
+README.md
+*.zip
+*.sh

.env.example ADDED Viewed

	@@ -0,0 +1,44 @@

+# this is an example .env file, use it to create your own .env file and place it in the root of the project
+# settings for OpenAI
+OPENAI_API_BASE=https://api.openai.com/v1
+OPENAI_API_KEY=<YOUR_OPENAI_KEY>
+OPENAI_CHAT_MODEL=gpt-3.5-turbo
+OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
+# settings for Azure OpenAI
+AZURE_OPENAI_ENDPOINT=
+AZURE_OPENAI_API_KEY=
+OPENAI_API_VERSION=2024-02-15-preview
+AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
+AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
+# settings for Cohere
+COHERE_API_KEY=<COHERE_API_KEY>
+# settings for local models
+LOCAL_MODEL=llama3.1:8b
+LOCAL_MODEL_EMBEDDINGS=nomic-embed-text
+LOCAL_EMBEDDING_MODEL_DIM=768
+LOCAL_EMBEDDING_MODEL_MAX_TOKENS=8192
+# settings for GraphRAG
+GRAPHRAG_API_KEY=<YOUR_OPENAI_KEY>
+GRAPHRAG_LLM_MODEL=gpt-4o-mini
+GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small
+# set to true if you want to use customized GraphRAG config file
+USE_CUSTOMIZED_GRAPHRAG_SETTING=false
+# settings for Azure DI
+AZURE_DI_ENDPOINT=
+AZURE_DI_CREDENTIAL=
+# settings for Adobe API
+# get a free credential at https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api
+# then install the SDK with: pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
+PDF_SERVICES_CLIENT_ID=
+PDF_SERVICES_CLIENT_SECRET=
+# settings for PDF.js
+PDFJS_VERSION_DIST="pdfjs-4.0.379-dist"

.gitattributes CHANGED Viewed

@@ -1,35 +1,4 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.bat   text eol=crlf
+docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text
+kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text
+kotaemon/kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text

.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

	@@ -0,0 +1,85 @@

+name: "Bug Report"
+description: Report something that is not working as expected
+title: "[BUG] "
+labels: ["bug"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        *Please fill this form with as much information as possible.*
+  - type: textarea
+    id: description
+    attributes:
+      label: "Description"
+      description: Please enter an explicit description of your issue
+      placeholder: Short and explicit description of your incident...
+    validations:
+      required: true
+  - type: textarea
+    id: reprod
+    attributes:
+      label: "Reproduction steps"
+      description: Please list the exact steps needed to reproduce the issue
+      value: |
+        1. Go to '...'
+        2. Click on '....'
+        3. Scroll down to '....'
+        4. See error
+      render: bash
+    validations:
+      required: true
+  - type: textarea
+    id: screenshot
+    attributes:
+      label: "Screenshots"
+      description: If applicable, add screenshots to help explain your problem.
+      value: |
+        ![DESCRIPTION](LINK.png)
+      render: bash
+    validations:
+      required: false
+  - type: textarea
+    id: logs
+    attributes:
+      label: "Logs"
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: bash
+    validations:
+      required: false
+  - type: dropdown
+    id: browsers
+    attributes:
+      label: "Browsers"
+      description: What browsers are you seeing the problem on ?
+      multiple: true
+      options:
+        - Firefox
+        - Chrome
+        - Safari
+        - Microsoft Edge
+        - Opera
+        - Brave
+        - Other
+    validations:
+      required: false
+  - type: dropdown
+    id: os
+    attributes:
+      label: "OS"
+      description: What is the impacted environment ?
+      multiple: true
+      options:
+        - Windows
+        - MacOS
+        - Linux
+        - Other
+    validations:
+      required: false
+  - type: textarea
+    id: additional_information
+    attributes:
+      label: "Additional information"
+      description: Add any relevant information or context.
+      placeholder:
+    validations:
+      required: false

.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

	@@ -0,0 +1 @@


+blank_issues_enabled: false

.github/ISSUE_TEMPLATE/feature_request.yml ADDED Viewed

	@@ -0,0 +1,49 @@

+name: "Feature Request"
+description: Brainstorm and propose new features for the project
+title: "[REQUEST] "
+labels: ["enhancement"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        *Please fill this form with as much information as possible.*
+  - type: textarea
+    id: reference_issues
+    attributes:
+      label: "Reference Issues"
+      description: Common issues
+      placeholder: "#Issues IDs"
+    validations:
+      required: false
+  - type: textarea
+    id: summary
+    attributes:
+      label: "Summary"
+      description: Provide a brief explanation of the feature
+      placeholder: Describe in a few lines your feature request
+    validations:
+      required: true
+  - type: textarea
+    id: basic_example
+    attributes:
+      label: "Basic Example"
+      description: Indicate here some basic examples of your feature.
+      placeholder: A few specific words about your feature request.
+    validations:
+      required: true
+  - type: textarea
+    id: drawbacks
+    attributes:
+      label: "Drawbacks"
+      description: What are the drawbacks/impacts of your feature request ?
+      placeholder: Identify the drawbacks and impacts while being neutral on your feature request
+    validations:
+      required: true
+  - type: textarea
+    id: additional_information
+    attributes:
+      label: "Additional information"
+      description: Add any additional information that you think is important for your feature request
+      placeholder:
+    validations:
+      required: false

.github/PULL_REQUEST_TEMPLATE.md ADDED Viewed

	@@ -0,0 +1,18 @@

+## Description
+- Please include a summary of the changes and the related issue.
+- Fixes # (issue)
+## Type of change
+- [ ] New features (non-breaking change).
+- [ ] Bug fix (non-breaking change).
+- [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected).
+## Checklist
+- [ ] I have performed a self-review of my code.
+- [ ] I have added thorough tests if it is a core feature.
+- [ ] There is a reference to the original bug report and related work.
+- [ ] I have commented on my code, particularly in hard-to-understand areas.
+- [ ] The feature is well documented.

.github/workflows/auto-bump-and-release.yaml ADDED Viewed

	@@ -0,0 +1,62 @@

+name: Auto Bump and Release
+on:
+  push:
+    branches:
+      - main
+jobs:
+  auto-bump-and-release:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the repo
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Update Application Version
+        id: update-version
+        uses: anothrNick/github-tag-action@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          WITH_V: true
+          DEFAULT_BUMP: patch
+          MAJOR_STRING_TOKEN: "bump:major"
+          MINOR_STRING_TOKEN: "bump:minor"
+          PATCH_STRING_TOKEN: "bump:patch"
+      - name: Create release for ${{ steps.update-version.outputs.new_tag }}
+        # need to repeat this if statement because Github Action doesn't support early
+        # stopping for steps
+        if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
+        run: |
+          echo Create release folder
+          mkdir kotaemon-app
+          echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION
+          cp LICENSE.txt kotaemon-app/
+          cp flowsettings.py kotaemon-app/
+          cp app.py kotaemon-app/
+          cp .env.example kotaemon-app/.env
+          cp -r scripts kotaemon-app/
+          mkdir -p kotaemon-app/libs/ktem/ktem/
+          cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/
+          tree kotaemon-app
+          zip -r kotaemon-app.zip kotaemon-app
+      - name: Release ${{ steps.update-version.outputs.new_tag }}
+        if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
+        uses: softprops/action-gh-release@v2
+        with:
+          files: kotaemon-app.zip
+          fail_on_unmatched_files: true
+          token: ${{ secrets.GITHUB_TOKEN }}
+          generate_release_notes: true
+          tag_name: ${{ steps.update-version.outputs.new_tag }}
+          make_latest: true
+      - name: Setup latest branch locally without switching current branch
+        if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
+        run: git fetch origin latest:latest
+      - name: Update latest branch
+        if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }}
+        run: |
+          git branch -f latest tags/${{ steps.update-version.outputs.new_tag }}
+          git checkout latest
+          git push -f -u origin latest

.github/workflows/build-push-docker.yaml ADDED Viewed

	@@ -0,0 +1,103 @@

+name: Build and Push Docker Image
+on:
+  release:
+    types:
+      - created
+  push:
+    tags:
+      - "v[0-9]+.[0-9]+.[0-9]+"
+  workflow_dispatch:
+env:
+  REGISTRY: ghcr.io
+jobs:
+  build:
+    name: Build and push container
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+    strategy:
+      matrix:
+        target:
+          - lite
+          - full
+    steps:
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          # this might remove tools that are actually needed,
+          # if set to "true" but frees about 6 GB
+          tool-cache: true
+          # all of these default to true, but feel free to set to
+          # "false" if necessary for your workflow
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+      - name: Set repository and image name
+        run: |
+          echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>${GITHUB_ENV}
+        env:
+          IMAGE_NAME: "${{ github.repository }}"
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+        with:
+          image: tonistiigi/binfmt:latest
+          platforms: arm64,arm
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+      - name: Set up Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FULL_IMAGE_NAME }}
+          tags: |
+            # branch
+            type=ref,event=branch,suffix=-${{ matrix.target }}
+            # semver with suffix for lite/full targets
+            type=semver,pattern={{version}},suffix=-${{ matrix.target }}
+            # latest tag with suffix for lite/full targets
+            type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, 'pre') }},suffix=-${{ matrix.target }}
+          flavor: |
+            # This is disabled here so we can use the raw form above
+            latest=false
+            # Suffix is not used here since there's no way to disable it above
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build docker image
+        uses: docker/build-push-action@v6
+        with:
+          file: Dockerfile
+          context: .
+          push: true
+          platforms: linux/amd64, linux/arm64
+          tags: |
+            ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          target: ${{ matrix.target }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max

.github/workflows/pr-lint.yaml ADDED Viewed

	@@ -0,0 +1,77 @@

+name: "Lint PR"
+on:
+  pull_request:
+    types:
+      - opened
+      - edited
+      - synchronize
+permissions:
+  pull-requests: write
+jobs:
+  pr-title:
+    name: Validate PR title
+    runs-on: ubuntu-latest
+    permissions: write-all
+    steps:
+      - uses: amannn/action-semantic-pull-request@v5
+        id: lint_pr_title
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - uses: marocchino/sticky-pull-request-comment@v2
+        # If the previous step fails, the workflow would normally stop. Adding this
+        # condition lets execution continue so the populated error message can be posted.
+        if: always() && (steps.lint_pr_title.outputs.error_message != null)
+        with:
+          header: pr-title-lint-error
+          message: |
+            Hey there and thank you for opening this pull request! 👋🏼
+            We require pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted.
+            Details:
+            ```
+            ${{ steps.lint_pr_title.outputs.error_message }}
+            ```
+      # Delete a previous comment when the issue has been resolved
+      - if: ${{ steps.lint_pr_title.outputs.error_message == null }}
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: pr-title-lint-error
+          delete: true
+  commitlint:
+    if: false # Disable this job for now
+    name: Validate commit messages
+    runs-on: ubuntu-latest
+    permissions: write-all
+    steps:
+      - uses: actions/checkout@v4
+      - uses: wagoid/commitlint-github-action@v6
+        id: commitlint
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          configFile: ./.commitlintrc
+      - uses: buildingcash/json-to-markdown-table-action@v1
+        if: always() && (steps.commitlint.outcome != 'success')
+        id: table
+        with:
+          json: ${{ steps.commitlint.outputs.results }}
+      - uses: marocchino/sticky-pull-request-comment@v2
+        if: always() && (steps.commitlint.outcome != 'success')
+        with:
+          header: commitlint-error
+          message: |
+            **All commits** in this PR need to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and [.commitlintrc](${{ github.server_url }}/${{ github.repository }}/blob/${{ github.head_ref || github.ref_name }}/.commitlintrc).
+            Details:
+            ${{ steps.table.outputs.table }}
+      - if: ${{ steps.commitlint.outcome == 'success' }}
+        uses: marocchino/sticky-pull-request-comment@v2
+        with:
+          header: commitlint-error
+          delete: true

.github/workflows/style-check.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+name: style-check
+on:
+  pull_request:
+    branches: [main, develop]
+  push:
+    branches: [main, develop]
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the repo
+        uses: actions/checkout@v4
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: run pre-commit
+        uses: pre-commit/action@v3.0.0

.github/workflows/unit-test.yaml ADDED Viewed

	@@ -0,0 +1,109 @@

+name: unit-test
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+env:
+  THEFLOW_TEMP_PATH: ./tmp
+jobs:
+  unit-test:
+    # if: false # temporary disable this job due to legacy interface
+    #TODO: enable this job after the new interface is ready
+    if: ${{ !cancelled() }}
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 20
+    defaults:
+      run:
+        shell: ${{ matrix.shell }}
+    strategy:
+      matrix:
+        python-version: ["3.10", "3.11"]
+        include:
+          - os: ubuntu-latest
+            shell: bash
+            ACTIVATE_ENV: ". env/bin/activate"
+            GITHUB_OUTPUT: "$GITHUB_OUTPUT"
+          # - os: windows-latest
+          #   shell: pwsh
+          #   ACTIVATE_ENV: env/Scripts/activate.ps1
+          #   GITHUB_OUTPUT: "$env:GITHUB_OUTPUT"
+    name: unit testing with python ${{ matrix.python-version }}
+    steps:
+      - name: Clone the repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+      - name: Get Head Commit Message
+        id: get-head-commit-message
+        run: echo "message=$(git show -s --format=%s)" | tee -a ${{ matrix.GITHUB_OUTPUT }}
+      - name: Check ignore caching
+        id: check-ignore-cache
+        run: |
+          ignore_cache=${{ contains(steps.get-head-commit-message.outputs.message, '[ignore cache]') }}
+          echo "check=$ignore_cache" | tee -a ${{ matrix.GITHUB_OUTPUT }}
+      - name: Set up Python ${{ matrix.python-version }} on ${{ runner.os }}
+        uses: actions/setup-python@v4
+        id: setup_python
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+      - name: Get cache key
+        id: get-cache-key
+        run: |
+          pip install "setuptools-git-versioning>=2.0,<3"
+          package_version=$(setuptools-git-versioning)
+          cache_key="${{ runner.os }}-py${{ matrix.python-version }}-v${package_version}"
+          echo "key=$cache_key" | tee -a ${{ matrix.GITHUB_OUTPUT }}
+      - name: Try to restore dependencies from ${{ steps.get-cache-key.outputs.key }}
+        id: restore-dependencies
+        if: steps.check-ignore-cache.outputs.check != 'true'
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ steps.get-cache-key.outputs.key }}
+          # fall back to the cache of a previous version to reuse unchanged packages
+          restore-keys: ${{ runner.os }}-py${{ matrix.python-version }}
+      - name: Check cache hit
+        id: check-cache-hit
+        run: |
+          echo "cache-hit=${{ steps.restore-dependencies.outputs.cache-hit }}"
+          echo "cache-matched-key=${{ steps.restore-dependencies.outputs.cache-matched-key }}"
+          cache_hit=${{ steps.restore-dependencies.outputs.cache-primary-key == steps.restore-dependencies.outputs.cache-matched-key }}
+          echo "check=$cache_hit" | tee -a ${{ matrix.GITHUB_OUTPUT }}
+      - name: Install additional dependencies (if any)
+        run: |
+          python -m pip install --upgrade pip
+          cd libs/kotaemon
+          pip install -U --upgrade-strategy eager -e .[all]
+      - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
+        if: |
+          steps.check-ignore-cache.outputs.check != 'true' &&
+          steps.check-cache-hit.outputs.check != 'true'
+        uses: actions/cache/save@v3
+        with:
+          path: ${{ env.pythonLocation }}
+          key: ${{ steps.restore-dependencies.outputs.cache-primary-key }}
+      - name: Install OS-based packages
+        run: |
+          sudo apt update -qqy
+          sudo apt install -y poppler-utils libpoppler-dev tesseract-ocr
+      - name: Test kotaemon with pytest
+        run: |
+          pip show pytest
+          cd libs/kotaemon
+          pytest

.gitignore ADDED Viewed

	@@ -0,0 +1,479 @@

+# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+activate*
+activate/*
+kotaemon-env*
+.env
+### Emacs ###
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+# Org-mode
+.org-id-locations
+*_archive
+# flymake-mode
+*_flymake.*
+# eshell files
+/eshell/history
+/eshell/lastdir
+# elpa packages
+/elpa/
+# reftex files
+*.rel
+# AUCTeX auto folder
+/auto/
+# cask packages
+.cask/
+dist/
+# Flycheck
+flycheck_*.el
+# server auth directory
+/server/
+# projectiles files
+.projectile
+# directory configuration
+.dir-locals.el
+# network security
+/network-security.data
+### Linux ###
+# temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+# KDE directory preferences
+.directory
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+### PyCharm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+# AWS User-specific
+.idea/**/aws.xml
+# Generated files
+.idea/**/contentModel.xml
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+# CMake
+cmake-build-*/
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+# File-based project format
+*.iws
+# IntelliJ
+out/
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+# JIRA plugin
+atlassian-ide-plugin.xml
+# Cursive Clojure plugin
+.idea/replstate.xml
+# SonarLint plugin
+.idea/sonarlint/
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+# Editor-based Rest Client
+.idea/httpRequests
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+### PyCharm Patch ###
+# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
+# *.iml
+# modules.xml
+# .idea/misc.xml
+# *.ipr
+# Sonarlint plugin
+# https://plugins.jetbrains.com/plugin/7973-sonarlint
+.idea/**/sonarlint/
+# SonarQube Plugin
+# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
+.idea/**/sonarIssues.xml
+# Markdown Navigator plugin
+# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
+.idea/**/markdown-navigator.xml
+.idea/**/markdown-navigator-enh.xml
+.idea/**/markdown-navigator/
+# Cache file creation bug
+# See https://youtrack.jetbrains.com/issue/JBR-2257
+.idea/$CACHE_FILE$
+# CodeStream plugin
+# https://plugins.jetbrains.com/plugin/12206-codestream
+.idea/codestream.xml
+# Azure Toolkit for IntelliJ plugin
+# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
+.idea/**/azureSettings.xml
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+# ruff
+.ruff_cache/
+# LSP config files
+pyrightconfig.json
+### Vim ###
+# Swap
+[._]*.s[a-v][a-z]
+# comment out the next line if you don't need vector files
+!*.svg
+[._]*.sw[a-p]
+[._]s[a-rt-v][a-z]
+[._]ss[a-gi-z]
+[._]sw[a-p]
+# Session
+Session.vim
+Sessionx.vim
+# Temporary
+.netrwhist
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+# Local History for Visual Studio Code
+.history/
+# Built Visual Studio Code Extensions
+*.vsix
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+### Windows ###
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+# Dump file
+*.stackdump
+# Folder config file
+[Dd]esktop.ini
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+# Windows shortcuts
+*.lnk
+# PDF files
+*.pdf
+!libs/kotaemon/tests/resources/*.pdf
+.theflow/
+# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+*.py[coid]
+logs/
+.gitsecret/keys/random_seed
+!*.secret
+.envrc
+.env
+S.gpg-agent*
+.vscode/settings.json
+examples/example1/assets
+storage/*
+# Conda and env storages
+*install_dir/
+doc_env/
+# application data
+ktem_app_data/
+gradio_tmp/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,69 @@

+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+        args: ["--unsafe"]
+      - id: check-toml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: mixed-line-ending
+      - id: detect-aws-credentials
+        args: ["--allow-missing-credentials"]
+      - id: detect-private-key
+      - id: check-added-large-files
+        args: ["--maxkb=750"]
+      - id: debug-statements
+  - repo: https://github.com/ambv/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+        language_version: python3.10
+  - repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
+    hooks:
+      - id: flake8
+        args: ["--max-line-length", "88", "--extend-ignore", "E203"]
+  - repo: https://github.com/myint/autoflake
+    rev: v1.4
+    hooks:
+      - id: autoflake
+        args:
+          [
+            "--in-place",
+            "--remove-unused-variables",
+            "--remove-all-unused-imports",
+            "--ignore-init-module-imports",
+            "--exclude=tests/*",
+          ]
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v2.7.1
+    hooks:
+      - id: prettier
+        types_or: [markdown, yaml]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.7.1"
+    hooks:
+      - id: mypy
+        additional_dependencies:
+          [
+            types-PyYAML==6.0.12.11,
+            "types-requests",
+            "sqlmodel",
+            "types-Markdown",
+            types-tzlocal,
+          ]
+        args: ["--check-untyped-defs", "--ignore-missing-imports"]
+        exclude: "^templates/"
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.4
+    hooks:
+      - id: codespell
+        additional_dependencies:
+          - tomli

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.10

CODE_OF_CONDUCT.md ADDED Viewed

	@@ -0,0 +1,128 @@

+# Contributor Covenant Code of Conduct
+## Our Pledge
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+## Our Standards
+Examples of behavior that contributes to a positive environment for our
+community include:
+- Demonstrating empathy and kindness toward other people
+- Being respectful of differing opinions, viewpoints, and experiences
+- Giving and gracefully accepting constructive feedback
+- Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+- Focusing on what is best not just for us as individuals, but for the
+  overall community
+Examples of unacceptable behavior include:
+- The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+- Trolling, insulting or derogatory comments, and personal or political attacks
+- Public or private harassment
+- Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+- Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+## Enforcement Responsibilities
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+## Scope
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+## Enforcement
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+.
+All complaints will be reviewed and investigated promptly and fairly.
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+## Enforcement Guidelines
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+### 1. Correction
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+### 2. Warning
+**Community Impact**: A violation through a single incident or series
+of actions.
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+### 3. Temporary Ban
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+### 4. Permanent Ban
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+## Attribution
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+[homepage]: https://www.contributor-covenant.org
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,115 @@

+# Contributing to Kotaemon
+Welcome 👋 to the Kotaemon project! We're thrilled that you're interested in contributing. Whether you're fixing bugs, adding new features, or improving documentation, your efforts are highly appreciated. This guide aims to help you get started with contributing to Kotaemon.
+<a href="https://github.com/Cinnamon/kotaemon/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=Cinnamon/kotaemon" />
+</a>
+### Table of Contents
+1. [📖 Code of Conduct](#-code-of-conduct)
+2. [🔁 Contributing via Pull Requests](#-contributing-via-pull-requests)
+3. [📥 Opening an Issue](#-opening-an-issue)
+4. [📝 Commit Messages](#-commit-messages)
+5. [🧾 License](#-license)
+## 📖 Code of Conduct
+Please review our [code of conduct](./CODE_OF_CONDUCT.md), which is in effect at all times. We expect everyone who contributes to this project to honor it.
+## 🔁 Contributing via Pull Requests
+1. [**Fork the repository**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo): Click on the [Fork](https://github.com/Cinnamon/kotaemon/fork) button on the repository's page to create a copy of Kotaemon under your GitHub account.
+2. [**Clone your code**](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository): Clone your forked repository to your local machine.
+3. [**Create new branch**](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop): Create a new branch in your forked repo with a descriptive name that reflects your changes.
+```sh
+git checkout -b descriptive-name-for-your-changes
+```
+4. **Setup the development environment**: If you are working on the code, make sure to install the necessary dependencies for development
+```sh
+pip install -e "libs/kotaemon[dev]"
+```
+5. **Make your changes**: Ensure your code follows the project's coding style and passes all test cases.
+   - Check the coding style
+   ```sh
+   pre-commit run --all-files
+   ```
+   - Run the tests
+   ```sh
+   pytest libs/kotaemon/tests/
+   ```
+6. [**Commit your changes**](https://docs.github.com/en/desktop/making-changes-in-a-branch/committing-and-reviewing-changes-to-your-project-in-github-desktop): Once you are done with your changes, add and commit them with clear messages.
+```sh
+git add your_changes.py
+git commit -m "clear message describing your changes."
+git push -u origin descriptive-name-for-your-changes
+```
+7. [**Create a pull request**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request): When you are satisfied with your changes, [submit a pull request](https://github.com/Cinnamon/kotaemon/compare) from your forked repository to Kotaemon repository. In the pull request, provide a clear description of your changes and any related issues. For the title of the pull request, please refer to our [commit messages convention](#-commit-messages).
+8. **Wait for reviews**: Wait for the maintainers to review your pull request. If everything is okay, your changes will be merged into the Kotaemon project.
+### GitHub Actions CI Tests
+All pull requests must pass the [GitHub Actions Continuous Integration (CI)](https://docs.github.com/en/actions/about-github-actions/about-continuous-integration-with-github-actions) tests before they can be merged. These tests include coding-style checks, PR title validation, unit tests, etc. to ensure that your changes meet the project's quality standards. Please review and fix any CI failures that arise.
+## 📥 Opening an Issue
+Before [creating an issue](https://github.com/Cinnamon/kotaemon/issues/new/choose), search through existing issues to ensure you are not opening a duplicate. If you are reporting a bug or issue, please provide a reproducible example to help us quickly identify the problem.
+## 📝 Commit Messages
+### Overview
+We use [Angular convention](https://www.conventionalcommits.org/en/) for commit messages to maintain consistency and clarity in our project history. Please take a moment to familiarize yourself with this convention before making your first commit.
+_For the sake of simplicity, we use [squashing merge](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) with pull requests. Therefore, if you contribute via a pull request, just make sure your PR's title, rather than every individual commit, follows this convention._
+Commit format:
+```sh
+<gitmoji> <type>(<scope>): <subject>
+<BLANK LINE>
+<body>
+<BLANK LINE>
+<footer>
+```
+Examples:
+```sh
+docs(api): update api doc
+```
+### Commit types
+| Types      | Description                                                   |
+| :--------- | :------------------------------------------------------------ |
+| `feat`     | New features                                                  |
+| `fix`      | Bug fix                                                       |
+| `docs`     | Documentation only changes                                    |
+| `build`    | Changes that affect the build system or external dependencies |
+| `chore`    | Something that doesn’t fit the other types                    |
+| `ci`       | Changes to our CI configuration files and scripts             |
+| `perf`     | Improve performance                                           |
+| `refactor` | Refactor code                                                 |
+| `revert`   | Revert a previous commit                                      |
+| `style`    | Improve structure/format of the code                          |
+| `test`     | Add, update or pass tests                                     |
+## 🧾 License
+All contributions will be licensed under the project's license: [Apache License 2.0](https://github.com/Cinnamon/kotaemon/blob/main/LICENSE.txt).

Dockerfile ADDED Viewed

	@@ -0,0 +1,99 @@

+# Lite version
+FROM python:3.10-slim AS lite
+# Common dependencies
+RUN apt-get update -qqy && \
+    apt-get install -y --no-install-recommends \
+      ssh \
+      git \
+      gcc \
+      g++ \
+      poppler-utils \
+      libpoppler-dev \
+      unzip \
+      curl \
+      cargo
+# Setup args
+ARG TARGETPLATFORM
+ARG TARGETARCH
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV TARGETARCH=${TARGETARCH}
+# Create working directory
+WORKDIR /app
+# Download pdfjs
+COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh
+RUN chmod +x /app/scripts/download_pdfjs.sh
+ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist"
+RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR
+# Copy contents
+COPY . /app
+COPY .env.example /app/.env
+# Install pip packages
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    pip install -e "libs/kotaemon" \
+    && pip install -e "libs/ktem" \
+    && pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    if [ "$TARGETARCH" = "amd64" ]; then pip install "graphrag<=0.3.6" future; fi
+# Clean up
+RUN apt-get autoremove \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf ~/.cache
+CMD ["python", "app.py"]
+# Full version
+FROM lite AS full
+# Additional dependencies for full version
+RUN apt-get update -qqy && \
+    apt-get install -y --no-install-recommends \
+      tesseract-ocr \
+      tesseract-ocr-jpn \
+      libsm6 \
+      libxext6 \
+      libreoffice \
+      ffmpeg \
+      libmagic-dev
+# Install torch and torchvision for unstructured
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+# Install additional pip packages
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    pip install -e "libs/kotaemon[adv]" \
+    && pip install unstructured[all-docs]
+# Install lightRAG
+ENV USE_LIGHTRAG=true
+RUN --mount=type=ssh  \
+    --mount=type=cache,target=/root/.cache/pip  \
+    pip install aioboto3 nano-vectordb ollama xxhash lightrag-hku
+# Clean up
+RUN apt-get autoremove \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/* \
+    && rm -rf ~/.cache
+# Download nltk packages as required for unstructured
+RUN python -c "from unstructured.nlp.tokenize import _download_nltk_packages_if_not_present; _download_nltk_packages_if_not_present()"
+CMD ["python", "app.py"]

LICENSE.txt ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md CHANGED Viewed

@@ -1,12 +1,369 @@
 ---
-title: Renesis
-emoji: 🏢
-colorFrom: gray
-colorTo: blue
-sdk: gradio
-sdk_version: 5.5.0
 app_file: app.py
-pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: renesis
 app_file: app.py
+sdk: gradio
+sdk_version: 4.39.0
 ---
+<div align="center">
+# kotaemon
+An open-source clean & customizable RAG UI for chatting with your documents. Built with both end users and
+developers in mind.
+![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview-graph.png)
+<a href="https://trendshift.io/repositories/11607" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11607" alt="Cinnamon%2Fkotaemon | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
+[Live Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo) |
+[Online Install](https://cinnamon.github.io/kotaemon/online_install/) |
+[User Guide](https://cinnamon.github.io/kotaemon/) |
+[Developer Guide](https://cinnamon.github.io/kotaemon/development/) |
+[Feedback](https://github.com/Cinnamon/kotaemon/issues) |
+[Contact](mailto:[email protected])
+[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/release/python-31013/)
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+<a href="https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon" target="_blank">
+<img src="https://img.shields.io/badge/docker_pull-kotaemon:latest-brightgreen" alt="docker pull ghcr.io/cinnamon/kotaemon:latest"></a>
+![download](https://img.shields.io/github/downloads/Cinnamon/kotaemon/total.svg?label=downloads&color=blue)
+<a href='https://huggingface.co/spaces/cin-model/kotaemon-demo'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue'></a>
+<a href="https://hellogithub.com/en/repository/d3141471a0244d5798bc654982b263eb" target="_blank"><img src="https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=d3141471a0244d5798bc654982b263eb&claim_uid=RLiD9UZ1rEHNaMf&theme=small" alt="Featured｜HelloGitHub" /></a>
+</div>
+## Introduction
+This project serves as a functional RAG UI for both end users who want to do QA on their
+documents and developers who want to build their own RAG pipeline.
+<br>
+```yml
++----------------------------------------------------------------------------+
+| End users: Those who use apps built with `kotaemon`.                       |
+| (You use an app like the one in the demo above)                            |
+|     +----------------------------------------------------------------+     |
+|     | Developers: Those who built with `kotaemon`.                   |     |
+|     | (You have `import kotaemon` somewhere in your project)         |     |
+|     |     +----------------------------------------------------+     |     |
+|     |     | Contributors: Those who make `kotaemon` better.    |     |     |
+|     |     | (You make PR to this repo)                         |     |     |
+|     |     +----------------------------------------------------+     |     |
+|     +----------------------------------------------------------------+     |
++----------------------------------------------------------------------------+
+```
+### For end users
+- **Clean & Minimalistic UI**: A user-friendly interface for RAG-based QA.
+- **Support for Various LLMs**: Compatible with LLM API providers (OpenAI, AzureOpenAI, Cohere, etc.) and local LLMs (via `ollama` and `llama-cpp-python`).
+- **Easy Installation**: Simple scripts to get you started quickly.
+### For developers
+- **Framework for RAG Pipelines**: Tools to build your own RAG-based document QA pipeline.
+- **Customizable UI**: See your RAG pipeline in action with the provided UI, built with <a href='https://github.com/gradio-app/gradio'>Gradio <img src='https://img.shields.io/github/stars/gradio-app/gradio'></a>.
+- **Gradio Theme**: If you use Gradio for development, check out our theme here: [kotaemon-gradio-theme](https://github.com/lone17/kotaemon-gradio-theme).
+## Key Features
+- **Host your own document QA (RAG) web-UI**: Support multi-user login, organize your files in private/public collections, collaborate and share your favorite chat with others.
+- **Organize your LLM & Embedding models**: Support both local LLMs & popular API providers (OpenAI, Azure, Ollama, Groq).
+- **Hybrid RAG pipeline**: Sane default RAG pipeline with hybrid (full-text & vector) retriever and re-ranking to ensure best retrieval quality.
+- **Multi-modal QA support**: Perform Question Answering on multiple documents with figures and tables support. Support multi-modal document parsing (selectable options on UI).
+- **Advanced citations with document preview**: By default the system will provide detailed citations to ensure the correctness of LLM answers. View your citations (incl. relevance score) directly in the _in-browser PDF viewer_ with highlights. A warning is shown when the retrieval pipeline returns articles with low relevance.
+- **Support complex reasoning methods**: Use question decomposition to answer your complex/multi-hop question. Support agent-based reasoning with `ReAct`, `ReWOO` and other agents.
+- **Configurable settings UI**: You can adjust most important aspects of retrieval & generation process on the UI (incl. prompts).
+- **Extensible**: Being built on Gradio, you are free to customize or add any UI elements as you like. Also, we aim to support multiple strategies for document indexing & retrieval. `GraphRAG` indexing pipeline is provided as an example.
+![Preview](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview.png)
+## Installation
+> If you are not a developer and just want to use the app, please check out our easy-to-follow [User Guide](https://cinnamon.github.io/kotaemon/). Download the `.zip` file from the [latest release](https://github.com/Cinnamon/kotaemon/releases/latest) to get all the newest features and bug fixes.
+### System requirements
+1. [Python](https://www.python.org/downloads/) >= 3.10
+2. [Docker](https://www.docker.com/): optional, if you [install with Docker](#with-docker-recommended)
+3. [Unstructured](https://docs.unstructured.io/open-source/installation/full-installation#full-installation) if you want to process files other than `.pdf`, `.html`, `.mhtml`, and `.xlsx` documents. Installation steps differ depending on your operating system. Please visit the link and follow the specific instructions provided there.
+### With Docker (recommended)
+1. We support both `lite` & `full` versions of the Docker images. With `full`, the extra packages of `unstructured` are installed as well, so it can support additional file types (`.doc`, `.docx`, ...) at the cost of a larger Docker image size. For most users, the `lite` image should work well.
+   - To use the `lite` version.
+     ```bash
+     docker run \
+     -e GRADIO_SERVER_NAME=0.0.0.0 \
+     -e GRADIO_SERVER_PORT=7860 \
+     -p 7860:7860 -it --rm \
+     ghcr.io/cinnamon/kotaemon:main-lite
+     ```
+   - To use the `full` version.
+     ```bash
+     docker run \
+     -e GRADIO_SERVER_NAME=0.0.0.0 \
+     -e GRADIO_SERVER_PORT=7860 \
+     -p 7860:7860 -it --rm \
+     ghcr.io/cinnamon/kotaemon:main-full
+     ```
+2. We currently support and test two platforms: `linux/amd64` and `linux/arm64` (for newer Mac). You can specify the platform by passing `--platform` in the `docker run` command. For example:
+   ```bash
+   # To run docker with platform linux/arm64
+   docker run \
+   -e GRADIO_SERVER_NAME=0.0.0.0 \
+   -e GRADIO_SERVER_PORT=7860 \
+   -p 7860:7860 -it --rm \
+   --platform linux/arm64 \
+   ghcr.io/cinnamon/kotaemon:main-lite
+   ```
+3. Once everything is set up correctly, you can go to `http://localhost:7860/` to access the WebUI.
+4. We use [GHCR](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) to store docker images, all images can be found [here.](https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon)
+### Without Docker
+1. Clone and install required packages on a fresh python environment.
+   ```shell
+   # optional (setup env)
+   conda create -n kotaemon python=3.10
+   conda activate kotaemon
+   # clone this repo
+   git clone https://github.com/Cinnamon/kotaemon
+   cd kotaemon
+   pip install -e "libs/kotaemon[all]"
+   pip install -e "libs/ktem"
+   ```
+2. Create a `.env` file in the root of this project. Use `.env.example` as a template.
+   The `.env` file is there to serve use cases where users want to pre-configure the models before starting up the app (e.g. deploying the app on HF hub). The file will only be used to populate the db once upon the first run; it will no longer be used in subsequent runs.
+3. (Optional) To enable in-browser `PDF_JS` viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) then extract it to `libs/ktem/ktem/assets/prebuilt`
+<img src="https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/pdf-viewer-setup.png" alt="pdf-setup" width="300">
+4. Start the web server:
+   ```shell
+   python app.py
+   ```
+   - The app will be automatically launched in your browser.
+   - Default username and password are both `admin`. You can set up additional users directly through the UI.
+   ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png)
+5. Check the `Resources` tab and `LLMs and Embeddings` and ensure that your `api_key` value is set correctly from your `.env` file. If it is not set, you can set it there.
+### Setup GraphRAG
+> [!NOTE]
+> Official MS GraphRAG indexing only works with OpenAI or Ollama API.
+> We recommend that most users use the NanoGraphRAG implementation for straightforward integration with Kotaemon.
+<details>
+<summary>Setup Nano GRAPHRAG</summary>
+- Install nano-GraphRAG: `pip install nano-graphrag`
+- `nano-graphrag` install might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440)
+  - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib`
+- Launch Kotaemon with `USE_NANO_GRAPHRAG=true` environment variable.
+- Set your default LLM & Embedding models in Resources setting and it will be recognized automatically from NanoGraphRAG.
+</details>
+<details>
+<summary>Setup LIGHTRAG</summary>
+- Install LightRAG: `pip install git+https://github.com/HKUDS/LightRAG.git`
+- `LightRAG` install might introduce version conflicts, see [this issue](https://github.com/Cinnamon/kotaemon/issues/440)
+  - To quickly fix: `pip uninstall hnswlib chroma-hnswlib && pip install chroma-hnswlib`
+- Launch Kotaemon with `USE_LIGHTRAG=true` environment variable.
+- Set your default LLM & Embedding models in Resources setting and it will be recognized automatically from LightRAG.
+</details>
+<details>
+<summary>Setup MS GRAPHRAG</summary>
+- **Non-Docker Installation**: If you are not using Docker, install GraphRAG with the following command:
+  ```shell
+  pip install "graphrag<=0.3.6" future
+  ```
+- **Setting Up API KEY**: To use the GraphRAG retriever feature, ensure you set the `GRAPHRAG_API_KEY` environment variable. You can do this directly in your environment or by adding it to a `.env` file.
+- **Using Local Models and Custom Settings**: If you want to use GraphRAG with local models (like `Ollama`) or customize the default LLM and other configurations, set the `USE_CUSTOMIZED_GRAPHRAG_SETTING` environment variable to true. Then, adjust your settings in the `settings.yaml.example` file.
+</details>
+### Setup Local Models (for local/private RAG)
+See [Local model setup](docs/local_model.md).
+### Customize your application
+- By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
+- For advanced users or specific use cases, you can customize these files:
+  - `flowsettings.py`
+  - `.env`
+#### `flowsettings.py`
+This file contains the configuration of your application. You can use the example
+[here](flowsettings.py) as the starting point.
+<details>
+<summary>Notable settings</summary>
+```python
+# setup your preferred document store (with full-text search capabilities)
+KH_DOCSTORE=(Elasticsearch | LanceDB | SimpleFileDocumentStore)
+# setup your preferred vectorstore (for vector-based search)
+KH_VECTORSTORE=(ChromaDB | LanceDB | InMemory | Qdrant)
+# Enable / disable multimodal QA
+KH_REASONINGS_USE_MULTIMODAL=True
+# Setup your new reasoning pipeline or modify existing one.
+KH_REASONINGS = [
+    "ktem.reasoning.simple.FullQAPipeline",
+    "ktem.reasoning.simple.FullDecomposeQAPipeline",
+    "ktem.reasoning.react.ReactAgentPipeline",
+    "ktem.reasoning.rewoo.RewooAgentPipeline",
+]
+```
+</details>
+#### `.env`
+This file provides another way to configure your models and credentials.
+<details>
+<summary>Configure model via the .env file</summary>
+- Alternatively, you can configure the models via the `.env` file with the information needed to connect to the LLMs. This file is located in the folder of the application. If you don't see it, you can create one.
+- Currently, the following providers are supported:
+  - **OpenAI**
+    In the `.env` file, set the `OPENAI_API_KEY` variable with your OpenAI API key in order
+    to enable access to OpenAI's models. There are other variables that can be modified,
+    please feel free to edit them to fit your case. Otherwise, the default parameter should
+    work for most people.
+    ```shell
+    OPENAI_API_BASE=https://api.openai.com/v1
+    OPENAI_API_KEY=<your OpenAI API key here>
+    OPENAI_CHAT_MODEL=gpt-3.5-turbo
+    OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
+    ```
+  - **Azure OpenAI**
+    For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API
+    key. You might also need to provide your deployments' names for the chat model and the
+    embedding model depending on how you set up your Azure deployment.
+    ```shell
+    AZURE_OPENAI_ENDPOINT=
+    AZURE_OPENAI_API_KEY=
+    OPENAI_API_VERSION=2024-02-15-preview
+    AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
+    AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
+    ```
+  - **Local Models**
+    - Using `ollama` OpenAI compatible server:
+      - Install [ollama](https://github.com/ollama/ollama) and start the application.
+      - Pull your model, for example:
+        ```shell
+        ollama pull llama3.1:8b
+        ollama pull nomic-embed-text
+        ```
+      - Set the model names on web UI and make it as default:
+        ![Models](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/models.png)
+    - Using `GGUF` with `llama-cpp-python`
+      You can search for and download an LLM to be run locally from the [Hugging Face Hub](https://huggingface.co/models). Currently, these model formats are supported:
+      - GGUF
+        You should choose a model whose size is less than your device's memory and should leave
+        about 2 GB. For example, if you have 16 GB of RAM in total, of which 12 GB is available,
+        then you should choose a model that takes up at most 10 GB of RAM. Bigger models tend to
+        give better generation but also take more processing time.
+        Here are some recommendations and their size in memory:
+      - [Qwen1.5-1.8B-Chat-GGUF](https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true): around 2 GB
+        Add a new LlamaCpp model with the provided model name on the web UI.
+  </details>
+### Adding your own RAG pipeline
+#### Custom Reasoning Pipeline
+1. Check the default pipeline implementation [here](libs/ktem/ktem/reasoning/simple.py). You can make quick adjustments to how the default QA pipeline works.
+2. Add a new `.py` implementation in `libs/ktem/ktem/reasoning/` and later include it in `flowsettings` to enable it on the UI.
+#### Custom Indexing Pipeline
+- Check sample implementation in `libs/ktem/ktem/index/file/graph`
+> (more instruction WIP).
+## Star History
+<a href="https://star-history.com/#Cinnamon/kotaemon&Date">
+ <picture>
+   <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date&theme=dark" />
+   <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date" />
+   <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Cinnamon/kotaemon&type=Date" />
+ </picture>
+</a>
+## Contribution
+Since our project is actively being developed, we greatly value your feedback and contributions. Please see our [Contributing Guide](https://github.com/Cinnamon/kotaemon/blob/main/CONTRIBUTING.md) to get started. Thank you to all our contributors!
+<a href="https://github.com/Cinnamon/kotaemon/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=Cinnamon/kotaemon" />
+</a>

app.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+from theflow.settings import settings as flowsettings
+KH_APP_DATA_DIR = getattr(flowsettings, "KH_APP_DATA_DIR", ".")
+GRADIO_TEMP_DIR = os.getenv("GRADIO_TEMP_DIR", None)
+# override GRADIO_TEMP_DIR if it's not set
+if GRADIO_TEMP_DIR is None:
+    GRADIO_TEMP_DIR = os.path.join(KH_APP_DATA_DIR, "gradio_tmp")
+    os.environ["GRADIO_TEMP_DIR"] = GRADIO_TEMP_DIR
+from ktem.main import App  # noqa
+app = App()
+demo = app.make()
+demo.queue().launch(
+    share=True,
+    favicon_path=app._favicon,
+    inbrowser=True,
+    allowed_paths=[
+        "libs/ktem/ktem/assets",
+        GRADIO_TEMP_DIR,
+    ],
+)

doc_env_reqs.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+mkdocs
+mkdocstrings[python]
+mkdocs-material
+mkdocs-gen-files
+mkdocs-literate-nav
+mkdocs-video
+mkdocs-git-revision-date-localized-plugin
+mkdocs-section-index
+mdx_truly_sane_lists

docs/about.md ADDED Viewed

	@@ -0,0 +1,14 @@

+# About Kotaemon
+An open-source tool for chatting with your documents. Built with both end users and
+developers in mind.
+[Source Code](https://github.com/Cinnamon/kotaemon) |
+[Live Demo](https://huggingface.co/spaces/cin-model/kotaemon-demo)
+[User Guide](https://cinnamon.github.io/kotaemon/) |
+[Developer Guide](https://cinnamon.github.io/kotaemon/development/) |
+[Feedback](https://github.com/Cinnamon/kotaemon/issues)
+[Dark Mode](?__theme=dark) |
+[Light Mode](?__theme=light)

docs/development/contributing.md ADDED Viewed

	@@ -0,0 +1,116 @@

+# Contributing
+## Setting up
+- Clone the repo
+  ```shell
+  git clone [email protected]:Cinnamon/kotaemon.git
+  cd kotaemon
+  ```
+- Install the environment
+  - Create a conda environment (python >= 3.10 is recommended)
+    ```shell
+    conda create -n kotaemon python=3.10
+    conda activate kotaemon
+    # install dependencies
+    cd libs/kotaemon
+    pip install -e ".[all]"
+    ```
+  - Or run the installer (one of the `scripts/run_*` scripts depends on your OS), then
+    you will have all the dependencies installed as a conda environment at
+    `install_dir/env`.
+    ```shell
+    conda activate install_dir/env
+    ```
+- Pre-commit
+  ```shell
+  pre-commit install
+  ```
+- Test
+  ```shell
+  pytest tests
+  ```
+## Package overview
+`kotaemon` library focuses on the AI building blocks to implement a RAG-based QA application. It consists of base interfaces, core components and a list of utilities:
+- Base interfaces: `kotaemon` defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly defining this interface, a pipeline of steps can be easily constructed and orchestrated.
+- Core components: `kotaemon` implements (or wraps 3rd-party libraries
+  like Langchain, llama-index,... when possible) commonly used components in
+  kotaemon use cases. Some of these components are: LLM, vector store,
+  document store, retriever... For a detailed list and description of these
+  components, please refer to the [API Reference](../reference/Summary.md) section.
+- List of utilities: `kotaemon` provides utilities and tools that are
+  usually needed in client project. For example, it provides a prompt
+  engineering UI for AI developers in a project to quickly create a prompt
+  engineering tool for DMs and QALs. It also provides a command to quickly spin
+  up a project code base. For a full list and description of these utilities,
+  please refer to the [Utilities](utilities.md) section.
+```mermaid
+mindmap
+  root((kotaemon))
+    Base Interfaces
+      Document
+      LLMInterface
+      RetrievedDocument
+      BaseEmbeddings
+      BaseChat
+      BaseCompletion
+      ...
+    Core Components
+      LLMs
+        AzureOpenAI
+        OpenAI
+      Embeddings
+        AzureOpenAI
+        OpenAI
+        HuggingFaceEmbedding
+      VectorStore
+        InMemoryVectorstore
+        ChromaVectorstore
+      Agent
+      Tool
+      DocumentStore
+      ...
+    Utilities
+      Scaffold project
+      PromptUI
+      Documentation Support
+```
+## Common conventions
+- PR title: One-line description (example: Feat: Declare BaseComponent and decide LLM call interface).
+- [Encouraged] Provide a quick description in the PR, so that:
+  - Reviewers can quickly understand the direction of the PR.
+  - It will be included in the commit message when the PR is merged.
+## Environment caching on PR
+- To speed up CI, environments are cached based on the version specified in `__init__.py`.
+- Since dependency versions in `setup.py` are not pinned, you need to bump the version in order to use a new environment. That environment will then be cached and used by your subsequent commits within the PR, until you bump the version again.
+- The new environment created during your PR is cached and will be available to others once the PR is merged.
+- If you are experimenting with new dependencies and want a fresh environment every time, add `[ignore cache]` in your commit message. The CI will create a fresh environment to run your commit and then discard it.
+- If your PR includes updated dependencies, the recommended workflow would be:
+  - Doing development as usual.
+  - When you want to run the CI, push a commit with the message containing `[ignore cache]`.
+  - Once the PR is final, bump the version in `__init__.py` and push a final commit not containing `[ignore cache]`.
+## Merge PR guideline
+- Use squash and merge option
+- 1st line message is the PR title.
+- The text area is the PR description.

docs/development/create-a-component.md ADDED Viewed

	@@ -0,0 +1,71 @@

+# Creating a component
+A fundamental concept in kotaemon is "component".
+Anything that isn't data or data structure is a "component". A component can be
+thought of as a step within a pipeline. It takes in some input, processes it,
+and returns an output, just the same as a Python function! The output will then
+become an input for the next component in a pipeline. In fact, a pipeline is just
+a component. More appropriately, a nested component: a component that makes use of one or more other components in
+the processing step. So in reality, there isn't a difference between a pipeline
+and a component! Because of that, in kotaemon, we will consider them the
+same as "component".
+To define a component, you will:
+1. Create a class that subclasses from `kotaemon.base.BaseComponent`
+2. Declare init params with type annotation
+3. Declare nodes (nodes are just other components!) with type annotation
+4. Implement the processing logic in `run`.
+The syntax of a component is as follows:
+```python
+from kotaemon.base import BaseComponent
+from kotaemon.llms import LCAzureChatOpenAI
+from kotaemon.parsers import RegexExtractor
+class FancyPipeline(BaseComponent):
+    param1: str = "This is param1"
+    param2: int = 10
+    param3: float
+    node1: BaseComponent    # this is a node because of BaseComponent type annotation
+    node2: LCAzureChatOpenAI  # this is also a node because LCAzureChatOpenAI subclasses BaseComponent
+    node3: RegexExtractor   # this is also a node because RegexExtractor subclasses BaseComponent
+    def run(self, some_text: str):
+        prompt = (self.param1 + some_text) * int(self.param2 + self.param3)
+        llm_pred = self.node2(prompt).text
+        matches = self.node3(llm_pred)
+        return matches
+```
+Then this component can be used as follows:
+```python
+llm = LCAzureChatOpenAI(endpoint="some-endpoint")
+extractor = RegexExtractor(pattern=["yes", "Yes"])
+component = FancyPipeline(
+    param1="Hello",
+    param3=1.5,
+    node1=llm,
+    node2=llm,
+    node3=extractor
+)
+component("goodbye")
+```
+This way, we can define each operation as a reusable component, and use them to
+compose larger reusable components!
+## Benefits of component
+By defining a component as above, we formally encapsulate all the necessary
+information inside a single class. This introduces several benefits:
+1. Allow tools like promptui to inspect the inner working of a component in
+   order to automatically generate the promptui.
+2. Allow visualizing a pipeline for debugging purpose.

docs/development/data-components.md ADDED Viewed

	@@ -0,0 +1,34 @@

+# Data & Data Structure Components
+The data & data structure components include:
+- The `Document` class.
+- The document store.
+- The vector store.
+## Data Loader
+- PdfLoader
+- Layout-aware with table parsing PdfLoader
+  - MathPixLoader: To use this loader, you need MathPix API key, refer to [mathpix docs](https://docs.mathpix.com/#introduction) for more information
+  - OCRLoader: This loader uses lib-table and Flax pipeline to perform OCR and read table structure from PDF file (TODO: add more info about deployment of this module).
+  - Output:
+    - Document: text + metadata to identify whether it is table or not
+      ```
+      - "source": source file name
+      - "type": "table" or "text"
+      - "table_origin": original table in markdown format (to be feed to LLM or visualize using external tools)
+      - "page_label": page number in the original PDF document
+      ```
+## Document Store
+- InMemoryDocumentStore
+## Vector Store
+- ChromaVectorStore
+- InMemoryVectorStore

docs/development/index.md ADDED Viewed

	@@ -0,0 +1 @@


1	+ --8<-- "README.md"

docs/development/utilities.md ADDED Viewed

	@@ -0,0 +1,169 @@

+# Utilities
+## Prompt engineering UI
+![chat-ui](images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png)
+**_Important:_** despite the name prompt engineering UI, this tool allows testers to test any kind of parameters that are exposed by developers. Prompt is one kind of param. There can be other types of params that testers can tweak (e.g. top_k, temperature...).
+In the development process, developers typically build the pipeline. However, for use
+cases requiring expertise in prompt creation, non-technical members (testers, domain experts) can be more
+effective. To facilitate this, `kotaemon` offers a user-friendly prompt engineering UI
+that developers integrate into their pipelines. This enables non-technical members to
+adjust prompts and parameters, run experiments, and export results for optimization.
+As of Sept 2023, there are 2 kinds of prompt engineering UI:
+- Simple pipeline: run one-way from start to finish.
+- Chat pipeline: interactive back-and-forth.
+### Simple pipeline
+For a simple pipeline, the supported client project workflow looks as follows:
+1. [tech] Build pipeline
+2. [tech] Export pipeline to config: `$ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>`
+3. [tech] Customize the config
+4. [tech] Spin up prompt engineering UI: `$ kotaemon promptui run <path/to/config/file.yml>`
+5. [non-tech] Change params, run inference
+6. [non-tech] Export to Excel
+7. [non-tech] Select the set of params that achieve the best output
+The prompt engineering UI is primarily involved in steps 2 through 7 (step 1 is normally
+done by the developers, while step 7 happens exclusively in the Excel file).
+#### Step 2 - Export pipeline to config
+Command:
+```shell
+$ kotaemon promptui export <module.path.pipelineclass> --output <path/to/config/file.yml>
+```
+where:
+- `<module.path.pipelineclass>` is a dot-separated path to the pipeline. For example, if your pipeline can be accessed with `from projectA.pipelines import AnsweringPipeline`, then this value is `projectA.pipelines.AnsweringPipeline`.
+- `<path/to/config/file.yml>` is the target file path that the config will be exported to. If the config file already exists, and contains information of other pipelines, the config of current pipeline will additionally be added. If it contains information of the current pipeline (in the past), the old information will be replaced.
+By default, all params in a pipeline (including nested params) will be exported to the configuration file. For params that you do not wish to expose to the UI, you can directly remove them from the config YAML file. You can also annotate those params with `ignore_ui=True`, and they will be ignored in the config generation process. Example:
+```python
+class Pipeline(BaseComponent):
+    param1: str = Param(default="hello")
+    param2: str = Param(default="goodbye", ignore_ui=True)
+```
+Declared as above, `param1` will show up in the config YAML file, while `param2` will not.
+#### Step 3 - Customize the config
+Developers can further edit the config file in this step to get the UI (step 4) that best suits their tasks. The exported config will have this overall schema:
+```yml
+<module.path.pipelineclass1>:
+  params: ... (Detail param information to initiate a pipeline. This corresponds to the pipeline init parameters.)
+  inputs: ... (Detail the input of the pipeline e.g. a text prompt. This corresponds to the params of `run(...)` method.)
+  outputs: ... (Detail the output of the pipeline e.g. prediction, accuracy... This is the output information we wish to see in the UI.)
+  logs: ... (Detail what information should show up in the log.)
+```
+##### Input and params
+The inputs section has the following overall schema:
+```yml
+inputs:
+  <input-variable-name-1>:
+    component: <supported-UI-component>
+    params: # (this section is optional)
+      value: <default-value>
+  <input-variable-name-2>: ... # similar to above
+params:
+  <param-variable-name-1>: ... # similar to those in the inputs
+```
+The list of supported prompt UI and their corresponding gradio UI components:
+```python
+COMPONENTS_CLASS = {
+    "text": gr.components.Textbox,
+    "checkbox": gr.components.CheckboxGroup,
+    "dropdown": gr.components.Dropdown,
+    "file": gr.components.File,
+    "image": gr.components.Image,
+    "number": gr.components.Number,
+    "radio": gr.components.Radio,
+    "slider": gr.components.Slider,
+}
+```
+##### Outputs
+The outputs are a list of variables that we wish to show in the UI. Since a Python function's output doesn't have a variable name, the output declaration is a little different from the input and param declarations:
+```yml
+outputs:
+  - component: <supported-UI-component>
+    step: <name-of-pipeline-step>
+    item: <jsonpath way to retrieve the info>
+  - ... # similar to above
+```
+where:
+- component: the same text string and corresponding Gradio UI as in inputs & params
+- step: the pipeline step whose output we wish to fetch and show on the UI
+- item: the jsonpath mechanism to get the targeted variable from the step above
+##### Logs
+The logs show a list of sheetname and how to retrieve the desired information.
+```yml
+logs:
+  <logname>:
+    inputs:
+      - name: <column name>
+        step: <the pipeline step that we would wish to see the input>
+        variable: <the variable in the step>
+      - ...
+    outputs:
+      - name: <column name>
+        step: <the pipeline step that we would wish to see the output>
+        item: <how to retrieve the output of that step>
+```
+#### Step 4 + 5 - Spin up prompt engineering UI + Perform prompt engineering
+Command:
+```shell
+$ kotaemon promptui run <path/to/config/file.yml>
+```
+This will generate a UI as follows:
+![Screenshot from 2023-09-20 12-20-31](images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png)
+where:
+- The tabs at the top of the UI correspond to the pipelines on which to do prompt engineering.
+- The inputs and params tabs allow users to edit values (these correspond to the inputs and params in the config file).
+- The outputs panel holds the UI elements to show the outputs defined in config file.
+- The Run button: will execute the pipeline with the supplied inputs and params, and render the result in the outputs panel.
+- The Export button: will export the logs of all the runs to an Excel file for users to inspect for the best set of params.
+#### Step 6 - Export to Excel
+Upon clicking Export, users can download the Excel file.
+### Chat pipeline
+The chat pipeline workflow is different from the simple pipeline workflow. In a simple pipeline, each Run creates a set of outputs, inputs and params for users to compare. In a chat pipeline, each Run is not a one-off run, but a long interactive session. Hence, the workflow is as follows:
+1. Set the desired parameters.
+2. Click "New chat" to start a chat session with the supplied parameters. This set of parameters will persist until the end of the chat session. During an ongoing chat session, changing the parameters will not take any effect.
+3. Chat and interact with the chatbot on the right panel. You can add additional inputs (if any), and they will be supplied to the chatbot.
+4. During chat, the log of the chat will show up in the "Output" tabs. This is empty by default, so if you want to show the log here, tell the AI developers to configure the UI settings.
+5. When finishing chat, select your preference in the radio box. Click "End chat". This will save the chat log and the preference to disk.
+6. To compare the results of different runs, click "Export" to get an Excel spreadsheet summary of the different runs.