diff --git a/.commitlintrc b/.commitlintrc new file mode 100644 index 0000000000000000000000000000000000000000..6572ee32b536a2160cd1ffc457c52930212ccdb8 --- /dev/null +++ b/.commitlintrc @@ -0,0 +1,37 @@ +{ + "extends": ["@commitlint/config-conventional"], + "defaultIgnores": true, + "rules": { + "body-leading-blank": [1, "always"], + "body-max-line-length": [2, "always", 100], + "footer-leading-blank": [1, "always"], + "footer-max-line-length": [2, "always", 10000], + "header-max-length": [2, "always", 200], + "subject-case": [ + 2, + "never", + [] + ], + "subject-empty": [2, "never"], + "subject-full-stop": [2, "never", "."], + "type-case": [2, "always", "lower-case"], + "type-empty": [2, "never"], + "type-enum": [ + 2, + "always", + [ + "build", + "chore", + "ci", + "docs", + "feat", + "fix", + "perf", + "refactor", + "revert", + "style", + "test" + ] + ] + } +} diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..0a573121140d734efc65c4ee42fbe67ad8398c9f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,13 @@ +.github/ +.git/ +.mypy_cache/ +__pycache__/ +ktem_app_data/ +env/ +.pre-commit-config.yaml +.commitlintrc +.gitignore +.gitattributes +README.md +*.zip +*.sh diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..c92d51114fb0f7f19e5ec0bd14beeb5a83d1c681 --- /dev/null +++ b/.env.example @@ -0,0 +1,44 @@ +# this is an example .env file, use it to create your own .env file and place it in the root of the project + +# settings for OpenAI +OPENAI_API_BASE=https://api.openai.com/v1 +OPENAI_API_KEY= +OPENAI_CHAT_MODEL=gpt-3.5-turbo +OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002 + +# settings for Azure OpenAI +AZURE_OPENAI_ENDPOINT= +AZURE_OPENAI_API_KEY= +OPENAI_API_VERSION=2024-02-15-preview +AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo +AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002 + +# settings for Cohere +COHERE_API_KEY= + +# settings for 
local models +LOCAL_MODEL=llama3.1:8b +LOCAL_MODEL_EMBEDDINGS=nomic-embed-text +LOCAL_EMBEDDING_MODEL_DIM=768 +LOCAL_EMBEDDING_MODEL_MAX_TOKENS=8192 + +# settings for GraphRAG +GRAPHRAG_API_KEY= +GRAPHRAG_LLM_MODEL=gpt-4o-mini +GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small + +# set to true if you want to use customized GraphRAG config file +USE_CUSTOMIZED_GRAPHRAG_SETTING=false + +# settings for Azure DI +AZURE_DI_ENDPOINT= +AZURE_DI_CREDENTIAL= + +# settings for Adobe API +# get free credentials at https://acrobatservices.adobe.com/dc-integration-creation-app-cdn/main.html?api=pdf-extract-api +# then install the SDK with: pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" +PDF_SERVICES_CLIENT_ID= +PDF_SERVICES_CLIENT_SECRET= + +# settings for PDF.js +PDFJS_VERSION_DIST="pdfjs-4.0.379-dist" diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..495d0316c6137c30f2bd7f8e658c876c15ac2e60 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,35 +1,4 @@ -*.7z filter=lfs diff=lfs merge=lfs -text -*.arrow filter=lfs diff=lfs merge=lfs -text -*.bin filter=lfs diff=lfs merge=lfs -text -*.bz2 filter=lfs diff=lfs merge=lfs -text -*.ckpt filter=lfs diff=lfs merge=lfs -text -*.ftz filter=lfs diff=lfs merge=lfs -text -*.gz filter=lfs diff=lfs merge=lfs -text -*.h5 filter=lfs diff=lfs merge=lfs -text -*.joblib filter=lfs diff=lfs merge=lfs -text -*.lfs.* filter=lfs diff=lfs merge=lfs -text -*.mlmodel filter=lfs diff=lfs merge=lfs -text -*.model filter=lfs diff=lfs merge=lfs -text -*.msgpack filter=lfs diff=lfs merge=lfs -text -*.npy filter=lfs diff=lfs merge=lfs -text -*.npz filter=lfs diff=lfs merge=lfs -text -*.onnx filter=lfs diff=lfs merge=lfs -text -*.ot filter=lfs diff=lfs merge=lfs -text -*.parquet filter=lfs diff=lfs merge=lfs -text -*.pb filter=lfs diff=lfs merge=lfs -text -*.pickle filter=lfs diff=lfs merge=lfs -text -*.pkl filter=lfs diff=lfs merge=lfs -text -*.pt
filter=lfs diff=lfs merge=lfs -text -*.pth filter=lfs diff=lfs merge=lfs -text -*.rar filter=lfs diff=lfs merge=lfs -text -*.safetensors filter=lfs diff=lfs merge=lfs -text -saved_model/**/* filter=lfs diff=lfs merge=lfs -text -*.tar.* filter=lfs diff=lfs merge=lfs -text -*.tar filter=lfs diff=lfs merge=lfs -text -*.tflite filter=lfs diff=lfs merge=lfs -text -*.tgz filter=lfs diff=lfs merge=lfs -text -*.wasm filter=lfs diff=lfs merge=lfs -text -*.xz filter=lfs diff=lfs merge=lfs -text -*.zip filter=lfs diff=lfs merge=lfs -text -*.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text +*.bat text eol=crlf +docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text +kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text +kotaemon/kotaemon/docs/images/chat-demo.gif filter=lfs diff=lfs merge=lfs -text diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000000000000000000000000000000000000..9110eac98da231d56149c79adf7600c8ae878fe3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,85 @@ +name: "Bug Report" +description: Report something that is not working as expected +title: "[BUG] " +labels: ["bug"] +body: + - type: markdown + attributes: + value: | + *Please fill this form with as much information as possible.* + - type: textarea + id: description + attributes: + label: "Description" + description: Please enter an explicit description of your issue + placeholder: Short and explicit description of your incident... + validations: + required: true + - type: textarea + id: reprod + attributes: + label: "Reproduction steps" + description: Please list the exact steps needed to reproduce the issue + value: | + 1. Go to '...' + 2. Click on '....' + 3. Scroll down to '....' + 4.
See error + render: bash + validations: + required: true + - type: textarea + id: screenshot + attributes: + label: "Screenshots" + description: If applicable, add screenshots to help explain your problem. + value: | + ![DESCRIPTION](LINK.png) + render: bash + validations: + required: false + - type: textarea + id: logs + attributes: + label: "Logs" + description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. + render: bash + validations: + required: false + - type: dropdown + id: browsers + attributes: + label: "Browsers" + description: What browsers are you seeing the problem on ? + multiple: true + options: + - Firefox + - Chrome + - Safari + - Microsoft Edge + - Opera + - Brave + - Other + validations: + required: false + - type: dropdown + id: os + attributes: + label: "OS" + description: What is the impacted environment ? + multiple: true + options: + - Windows + - MacOS + - Linux + - Other + validations: + required: false + - type: textarea + id: additional_information + attributes: + label: "Additional information" + description: Add any relevant information or context. 
+ placeholder: + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..3ba13e0cec6cbbfd462e9ebf529dd2093148cd69 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1 @@ +blank_issues_enabled: false diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000000000000000000000000000000000000..32e66aeb110095017c9f48dee0326e54ea695de6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,49 @@ +name: "Feature Request" +description: Brainstorm and propose new features for the project +title: "[REQUEST] " +labels: ["enhancement"] +body: + - type: markdown + attributes: + value: | + *Please fill this form with as much information as possible.* + - type: textarea + id: reference_issues + attributes: + label: "Reference Issues" + description: Common issues + placeholder: "#Issues IDs" + validations: + required: false + - type: textarea + id: summary + attributes: + label: "Summary" + description: Provide a brief explanation of the feature + placeholder: Describe in a few lines your feature request + validations: + required: true + - type: textarea + id: basic_example + attributes: + label: "Basic Example" + description: Indicate here some basic examples of your feature. + placeholder: A few specific words about your feature request. + validations: + required: true + - type: textarea + id: drawbacks + attributes: + label: "Drawbacks" + description: What are the drawbacks/impacts of your feature request ? 
+ placeholder: Identify the drawbacks and impacts while being neutral on your feature request + validations: + required: true + - type: textarea + id: additional_information + attributes: + label: "Additional information" + description: Add any additional information that you think is important for your feature request + placeholder: + validations: + required: false diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..162d46670b030e8ede22df3727b633e7fd9b78d7 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,18 @@ +## Description + +- Please include a summary of the changes and the related issue. +- Fixes # (issue) + +## Type of change + +- [ ] New features (non-breaking change). +- [ ] Bug fix (non-breaking change). +- [ ] Breaking change (fix or feature that would cause existing functionality not to work as expected). + +## Checklist + +- [ ] I have performed a self-review of my code. +- [ ] I have added thorough tests if it is a core feature. +- [ ] There is a reference to the original bug report and related work. +- [ ] I have commented on my code, particularly in hard-to-understand areas. +- [ ] The feature is well documented. 
diff --git a/.github/workflows/auto-bump-and-release.yaml b/.github/workflows/auto-bump-and-release.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9f7f9f059aef70be781b91937161643da5970ad --- /dev/null +++ b/.github/workflows/auto-bump-and-release.yaml @@ -0,0 +1,62 @@ +name: Auto Bump and Release + +on: + push: + branches: + - main + +jobs: + auto-bump-and-release: + runs-on: ubuntu-latest + steps: + - name: Clone the repo + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Update Application Version + id: update-version + uses: anothrNick/github-tag-action@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WITH_V: true + DEFAULT_BUMP: patch + MAJOR_STRING_TOKEN: "bump:major" + MINOR_STRING_TOKEN: "bump:minor" + PATCH_STRING_TOKEN: "bump:patch" + - name: Create release for ${{ steps.update-version.outputs.new_tag }} + # need to repeat this if statement because Github Action doesn't support early + # stopping for steps + if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} + run: | + echo Create release folder + mkdir kotaemon-app + echo ${{ steps.update-version.outputs.new_tag }} > kotaemon-app/VERSION + cp LICENSE.txt kotaemon-app/ + cp flowsettings.py kotaemon-app/ + cp app.py kotaemon-app/ + cp .env.example kotaemon-app/.env + cp -r scripts kotaemon-app/ + mkdir -p kotaemon-app/libs/ktem/ktem/ + cp -r libs/ktem/ktem/assets kotaemon-app/libs/ktem/ktem/ + + tree kotaemon-app + zip -r kotaemon-app.zip kotaemon-app + - name: Release ${{ steps.update-version.outputs.new_tag }} + if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} + uses: softprops/action-gh-release@v2 + with: + files: kotaemon-app.zip + fail_on_unmatched_files: true + token: ${{ secrets.GITHUB_TOKEN }} + generate_release_notes: true + tag_name: ${{ steps.update-version.outputs.new_tag }} + make_latest: true + - name: Setup latest branch locally without switching current branch + if: ${{ 
steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} + run: git fetch origin latest:latest + - name: Update latest branch + if: ${{ steps.update-version.outputs.new_tag != steps.update-version.outputs.old_tag }} + run: | + git branch -f latest tags/${{ steps.update-version.outputs.new_tag }} + git checkout latest + git push -f -u origin latest diff --git a/.github/workflows/build-push-docker.yaml b/.github/workflows/build-push-docker.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ee151a9df8b837da51327a932c8e2905761dc6af --- /dev/null +++ b/.github/workflows/build-push-docker.yaml @@ -0,0 +1,103 @@ +name: Build and Push Docker Image + +on: + release: + types: + - created + + push: + tags: + - "v[0-9]+.[0-9]+.[0-9]+" + + workflow_dispatch: + +env: + REGISTRY: ghcr.io + +jobs: + build: + name: Build and push container + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + attestations: write + id-token: write + strategy: + matrix: + target: + - lite + - full + steps: + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: true + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + + - name: Set repository and image name + run: | + echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>${GITHUB_ENV} + env: + IMAGE_NAME: "${{ github.repository }}" + + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:latest + platforms: arm64,arm + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ 
env.FULL_IMAGE_NAME }} + tags: | + # branch + type=ref,event=branch,suffix=-${{ matrix.target }} + # semver with suffix for lite/full targets + type=semver,pattern={{version}},suffix=-${{ matrix.target }} + # latest tag with suffix for lite/full targets + type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, 'pre') }},suffix=-${{ matrix.target }} + flavor: | + # This is disabled here so we can use the raw form above + latest=false + # Suffix is not used here since there's no way to disable it above + + - name: Log in to the Container registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build docker image + uses: docker/build-push-action@v6 + with: + file: Dockerfile + context: . + push: true + platforms: linux/amd64, linux/arm64 + tags: | + ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + target: ${{ matrix.target }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/pr-lint.yaml b/.github/workflows/pr-lint.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e04533e998ac0e66f9f08d1942f1472bbf859bd --- /dev/null +++ b/.github/workflows/pr-lint.yaml @@ -0,0 +1,77 @@ +name: "Lint PR" + +on: + pull_request: + types: + - opened + - edited + - synchronize + +permissions: + pull-requests: write + +jobs: + pr-title: + name: Validate PR title + runs-on: ubuntu-latest + permissions: write-all + steps: + - uses: amannn/action-semantic-pull-request@v5 + id: lint_pr_title + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - uses: marocchino/sticky-pull-request-comment@v2 + # When the previous steps fails, the workflow would stop. By adding this + # condition you can continue the execution with the populated error message. 
+ if: always() && (steps.lint_pr_title.outputs.error_message != null) + with: + header: pr-title-lint-error + message: | + Hey there and thank you for opening this pull request! 👋🏼 + + We require pull request titles to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and it looks like your proposed title needs to be adjusted. + Details: + ``` + ${{ steps.lint_pr_title.outputs.error_message }} + ``` + + # Delete a previous comment when the issue has been resolved + - if: ${{ steps.lint_pr_title.outputs.error_message == null }} + uses: marocchino/sticky-pull-request-comment@v2 + with: + header: pr-title-lint-error + delete: true + + commitlint: + if: false # Disable this job for now + name: Validate commit messages + runs-on: ubuntu-latest + permissions: write-all + steps: + - uses: actions/checkout@v4 + - uses: wagoid/commitlint-github-action@v6 + id: commitlint + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + configFile: ./.commitlintrc + - uses: buildingcash/json-to-markdown-table-action@v1 + if: always() && (steps.commitlint.outcome != 'success') + id: table + with: + json: ${{ steps.commitlint.outputs.results }} + - uses: marocchino/sticky-pull-request-comment@v2 + if: always() && (steps.commitlint.outcome != 'success') + with: + header: commitlint-error + message: | + **All commits** in this PR need to follow the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) and [.commitlintrc](${{ github.server_url }}/${{ github.repository }}/blob/${{ github.head_ref || github.ref_name }}/.commitlintrc).
+ Details: + ${{ steps.table.outputs.table }} + + - if: ${{ steps.commitlint.outcome == 'success' }} + uses: marocchino/sticky-pull-request-comment@v2 + with: + header: commitlint-error + delete: true diff --git a/.github/workflows/style-check.yaml b/.github/workflows/style-check.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7312feba1f27200a0569778d4444b560e0148cd0 --- /dev/null +++ b/.github/workflows/style-check.yaml @@ -0,0 +1,20 @@ +name: style-check + +on: + pull_request: + branches: [main, develop] + push: + branches: [main, develop] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - name: Clone the repo + uses: actions/checkout@v4 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: run pre-commit + uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b6b54dc571e4adfdc97c8ac482cdbc9a99330bc8 --- /dev/null +++ b/.github/workflows/unit-test.yaml @@ -0,0 +1,109 @@ +name: unit-test + +on: + pull_request: + branches: [main] + push: + branches: [main] + +env: + THEFLOW_TEMP_PATH: ./tmp + +jobs: + unit-test: + # if: false # temporary disable this job due to legacy interface + #TODO: enable this job after the new interface is ready + if: ${{ !cancelled() }} + runs-on: ${{ matrix.os }} + timeout-minutes: 20 + defaults: + run: + shell: ${{ matrix.shell }} + strategy: + matrix: + python-version: ["3.10", "3.11"] + include: + - os: ubuntu-latest + shell: bash + ACTIVATE_ENV: ". 
env/bin/activate" + GITHUB_OUTPUT: "$GITHUB_OUTPUT" + # - os: windows-latest + # shell: pwsh + # ACTIVATE_ENV: env/Scripts/activate.ps1 + # GITHUB_OUTPUT: "$env:GITHUB_OUTPUT" + + name: unit testing with python ${{ matrix.python-version }} + steps: + - name: Clone the repo + uses: actions/checkout@v4 + with: + ref: ${{ github.event.pull_request.head.sha }} + + - name: Get Head Commit Message + id: get-head-commit-message + run: echo "message=$(git show -s --format=%s)" | tee -a ${{ matrix.GITHUB_OUTPUT }} + + - name: Check ignore caching + id: check-ignore-cache + run: | + ignore_cache=${{ contains(steps.get-head-commit-message.outputs.message, '[ignore cache]') }} + echo "check=$ignore_cache" | tee -a ${{ matrix.GITHUB_OUTPUT }} + + - name: Set up Python ${{ matrix.python-version }} on ${{ runner.os }} + uses: actions/setup-python@v4 + id: setup_python + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + + - name: Get cache key + id: get-cache-key + run: | + pip install "setuptools-git-versioning>=2.0,<3" + package_version=$(setuptools-git-versioning) + cache_key="${{ runner.os }}-py${{ matrix.python-version }}-v${package_version}" + echo "key=$cache_key" | tee -a ${{ matrix.GITHUB_OUTPUT }} + + - name: Try to restore dependencies from ${{ steps.get-cache-key.outputs.key }} + id: restore-dependencies + if: steps.check-ignore-cache.outputs.check != 'true' + uses: actions/cache/restore@v3 + with: + path: ${{ env.pythonLocation }} + key: ${{ steps.get-cache-key.outputs.key }} + # fall back to a cache from a previous version to reuse unchanged packages + restore-keys: ${{ runner.os }}-py${{ matrix.python-version }} + + - name: Check cache hit + id: check-cache-hit + run: | + echo "cache-hit=${{ steps.restore-dependencies.outputs.cache-hit }}" + echo "cache-matched-key=${{ steps.restore-dependencies.outputs.cache-matched-key }}" + cache_hit=${{ steps.restore-dependencies.outputs.cache-primary-key == steps.restore-dependencies.outputs.cache-matched-key }} +
echo "check=$cache_hit" | tee -a ${{ matrix.GITHUB_OUTPUT }} + + - name: Install additional dependencies (if any) + run: | + python -m pip install --upgrade pip + cd libs/kotaemon + pip install -U --upgrade-strategy eager -e .[all] + + - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} + if: | + steps.check-ignore-cache.outputs.check != 'true' && + steps.check-cache-hit.outputs.check != 'true' + uses: actions/cache/save@v3 + with: + path: ${{ env.pythonLocation }} + key: ${{ steps.restore-dependencies.outputs.cache-primary-key }} + + - name: Install OS-based packages + run: | + sudo apt update -qqy + sudo apt install -y poppler-utils libpoppler-dev tesseract-ocr + + - name: Test kotaemon with pytest + run: | + pip show pytest + cd libs/kotaemon + pytest diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..80e557d158de79c5b3b6ecb6be833c7edb18cbb9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,479 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm +# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm + +activate* +activate/* +kotaemon-env* +.env + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode +.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + +### Linux ### + +# temporary files which can be created if a process still has a handle open of a deleted file 
+.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ 
+.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder 
config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# PDF files +*.pdf +!libs/kotaemon/tests/resources/*.pdf + +.theflow/ + +# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm +*.py[coid] + +logs/ +.gitsecret/keys/random_seed +!*.secret +.envrc +.env + +S.gpg-agent* +.vscode/settings.json +examples/example1/assets +storage/* + +# Conda and env storages +*install_dir/ +doc_env/ + +# application data +ktem_app_data/ +gradio_tmp/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..514991df867e87e986f21225c171409e3e9d1fe5 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,69 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: check-yaml + args: ["--unsafe"] + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: mixed-line-ending + - id: detect-aws-credentials + args: ["--allow-missing-credentials"] + - id: detect-private-key + - id: check-added-large-files + args: ["--maxkb=750"] + - id: debug-statements + - repo: https://github.com/ambv/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] + language_version: python3.10 + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + args: ["--max-line-length", "88", "--extend-ignore", "E203"] + - repo: https://github.com/myint/autoflake + rev: v1.4 + hooks: + - id: autoflake + args: + [ + "--in-place", + "--remove-unused-variables", + "--remove-all-unused-imports", + "--ignore-init-module-imports", + "--exclude=tests/*", + ] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: prettier + types_or: 
[markdown, yaml] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.7.1" + hooks: + - id: mypy + additional_dependencies: + [ + types-PyYAML==6.0.12.11, + "types-requests", + "sqlmodel", + "types-Markdown", + types-tzlocal, + ] + args: ["--check-untyped-defs", "--ignore-missing-imports"] + exclude: "^templates/" + - repo: https://github.com/codespell-project/codespell + rev: v2.2.4 + hooks: + - id: codespell + additional_dependencies: + - tomli diff --git a/.python-version b/.python-version new file mode 100644 index 0000000000000000000000000000000000000000..c8cfe3959183f8e9a50f83f54cd723f2dc9c252d --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..afaa98fe51e519ae72e7f62ae878fc4ae8ef3530 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +- Demonstrating empathy and kindness toward other people +- Being respectful of differing opinions, viewpoints, and experiences +- Giving and gracefully accepting constructive feedback +- Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +- Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +- The use of sexualized language or imagery, and sexual attention or + advances of any kind +- Trolling, insulting or derogatory comments, and personal or political attacks +- Public or private harassment +- Publishing others' private information, such as a physical or email + address, without their explicit permission +- Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..959faf88f825641ecfc5a4d09517c80f6ce6a6d7 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,115 @@ +# Contributing to Kotaemon + +Welcome 👋 to the Kotaemon project! We're thrilled that you're interested in contributing. Whether you're fixing bugs, adding new features, or improving documentation, your efforts are highly appreciated. This guide aims to help you get started with contributing to Kotaemon. + + + + + +### Table of Contents + +1. [📖 Code of Conduct](#-code-of-conduct) +2. [🔁 Contributing via Pull Requests](#-contributing-via-pull-requests) +3. [📥 Opening an Issue](#-opening-an-issue) +4. [📝 Commit Messages](#-commit-messages) +5. [🧾 License](#-license) + +## 📖 Code of Conduct + +Please review our [code of conduct](./CODE_OF_CONDUCT.md), which is in effect at all times. We expect everyone who contributes to this project to honor it. + +## 🔁 Contributing via Pull Requests + +1. 
[**Fork the repository**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo): Click on the [Fork](https://github.com/Cinnamon/kotaemon/fork) button on the repository's page to create a copy of Kotaemon under your GitHub account. + +2. [**Clone your code**](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository): Clone your forked repository to your local machine. + +3. [**Create a new branch**](https://docs.github.com/en/desktop/making-changes-in-a-branch/managing-branches-in-github-desktop): Create a new branch in your forked repo with a descriptive name that reflects your changes. + +```sh +git checkout -b descriptive-name-for-your-changes +``` + +4. **Set up the development environment**: If you are working on the code, make sure to install the necessary dependencies for development. + +```sh +pip install -e "libs/kotaemon[dev]" +``` + +5. **Make your changes**: Ensure your code follows the project's coding style and passes all test cases. + + - Check the coding style + + ```sh + pre-commit run --all-files + ``` + + - Run the tests + + ```sh + pytest libs/kotaemon/tests/ + ``` + +6. [**Commit your changes**](https://docs.github.com/en/desktop/making-changes-in-a-branch/committing-and-reviewing-changes-to-your-project-in-github-desktop): Once you are done with your changes, add and commit them with clear messages. + +```sh +git add your_changes.py +git commit -m "clear message describing your changes" +git push -u origin descriptive-name-for-your-changes +``` + +7. [**Create a pull request**](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request): When you are satisfied with your changes, [submit a pull request](https://github.com/Cinnamon/kotaemon/compare) from your forked repository to the Kotaemon repository. 
In the pull request, provide a clear description of your changes and any related issues. For the title of the pull request, please refer to our [commit messages convention](#-commit-messages). + +8. **Wait for reviews**: Wait for the maintainers to review your pull request. If everything is okay, your changes will be merged into the Kotaemon project. + +### GitHub Actions CI Tests + +All pull requests must pass the [GitHub Actions Continuous Integration (CI)](https://docs.github.com/en/actions/about-github-actions/about-continuous-integration-with-github-actions) tests before they can be merged. These tests include coding-style checks, PR title validation, unit tests, etc. to ensure that your changes meet the project's quality standards. Please review and fix any CI failures that arise. + +## 📥 Opening an Issue + +Before [creating an issue](https://github.com/Cinnamon/kotaemon/issues/new/choose), search through existing issues to ensure you are not opening a duplicate. If you are reporting a bug or issue, please provide a reproducible example to help us quickly identify the problem. + +## 📝 Commit Messages + +### Overview + +We use the [Angular convention](https://www.conventionalcommits.org/en/) for commit messages to maintain consistency and clarity in our project history. Please take a moment to familiarize yourself with this convention before making your first commit. + +_For the sake of simplicity, we use [squash merging](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/incorporating-changes-from-a-pull-request/about-pull-request-merges#squash-and-merge-your-commits) with pull requests. Therefore, if you contribute via a pull request, just make sure your PR's title, rather than every individual commit, follows this convention._ + +Commit format: + +```sh + (): + + + +