hlydecker committed
Commit 1ce95c4 · 0 Parent(s):

Duplicate from hlydecker/Augmented-Retrieval-qa-ChatGPT

.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
+ venv*/
+ tempDir/
+ .idea/
+ *.env
+ *.pkl
+ *.pickle
+ *testing*.py
README.md ADDED
@@ -0,0 +1,15 @@
+ ---
+ title: Augmented Retrieval Qa ChatGPT
+ emoji: 🚀
+ colorFrom: blue
+ colorTo: blue
+ sdk: streamlit
+ sdk_version: 1.19.0
+ app_file: streamlit_langchain_chat/streamlit_app.py
+ pinned: false
+ python_version: 3.10.4
+ license: cc-by-nc-sa-4.0
+ duplicated_from: hlydecker/Augmented-Retrieval-qa-ChatGPT
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.py ADDED
File without changes
requirements.txt ADDED
Binary file (5.03 kB)
 
static/__init__.py ADDED
File without changes
static/mini_nttdata.jpg ADDED
streamlit_langchain_chat/__init__.py ADDED
@@ -0,0 +1 @@
+
streamlit_langchain_chat/__version__.py ADDED
@@ -0,0 +1 @@
+ __VERSION__ = "1.0.4"
streamlit_langchain_chat/constants.py ADDED
@@ -0,0 +1,53 @@
+ from pathlib import Path
+ from PIL import Image
+
+ # from dotenv import load_dotenv, find_dotenv  # pip install python-dotenv==1.0.0
+
+ from __version__ import __VERSION__ as APP_VERSION
+
+ _SCRIPT_PATH = Path(__file__).absolute()
+ PARENT_APP_DIR = _SCRIPT_PATH.parent
+ TEMP_DIR = PARENT_APP_DIR / 'tempDir'
+ ROOT_DIR = PARENT_APP_DIR.parent
+ STATIC_DIR = ROOT_DIR / 'static'
+
+ # _env_file_path = find_dotenv(str(CODE_DIR / '.env'))  # Check if this path is correct
+ # if _env_file_path:
+ #     load_dotenv(_env_file_path)
+
+ ST_CONFIG = {
+     "page_title": "NTT Data - Chat Q&A",
+     # "page_icon": Image.open(STATIC_DIR / "mini_nttdata.jpg"),
+ }
+
+ OPERATING_MODE = "debug"  # debug, preproduction, production
+
+ REUSE_ANSWERS = False
+
+ LOAD_INDEX_LOCALLY = False
+ SAVE_INDEX_LOCALLY = False
+
+ # price in $ per 1,000 tokens
+ PRICES = {
+     'text-embedding-ada-002': 0.0004,
+     'text-davinci-003': 0.02,
+     'gpt-3': 0.002,
+     'gpt-4': 0.06,  # 8K context
+ }
+
+ SOURCES_IDS = {
+     # "Without source. Only chat": 4,
+     "local files": 1,
+     "urls": 3
+ }
+
+ TYPE_IDS = {
+     "MSF Azure OpenAI Service": 1,
+     "OpenAI": 2,
+ }
+
+
+ INDEX_IDS = {
+     "FAISS": 1,
+     "Pinecone": 2,
+ }
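PRICES maps a model name to its price in dollars per 1,000 tokens, so a call's cost is token count / 1000 × price. A minimal sketch of how the table is meant to be read; the helper name is hypothetical and not part of this commit:

# Hypothetical helper (not in the commit) showing how PRICES is interpreted:
# dollars per 1,000 tokens, scaled by the actual token count.
def estimate_cost(model_name: str, total_tokens: int) -> float:
    price_per_1k = PRICES.get(model_name, 0.0)
    return total_tokens / 1000 * price_per_1k

# estimate_cost('gpt-4', 1500) -> 0.09 with the values above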
streamlit_langchain_chat/customized_langchain/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore
+ from streamlit_langchain_chat.customized_langchain.vectorstores import FAISS
+ from streamlit_langchain_chat.customized_langchain.vectorstores import Pinecone
+
+
+ __all__ = [
+     "FAISS",
+     "InMemoryDocstore",
+     "Pinecone",
+ ]
streamlit_langchain_chat/customized_langchain/docstore/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """Wrappers on top of docstores."""
+ from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore
+
+
+ __all__ = [
+     "InMemoryDocstore",
+ ]
streamlit_langchain_chat/customized_langchain/docstore/in_memory.py ADDED
@@ -0,0 +1,27 @@
+ """Simple in memory docstore in the form of a dict."""
+ from typing import Dict, Union
+
+ from langchain.docstore.base import AddableMixin, Docstore
+ from langchain.docstore.document import Document
+
+
+ class InMemoryDocstore(Docstore, AddableMixin):
+     """Simple in memory docstore in the form of a dict."""
+
+     def __init__(self, dict_: Dict[str, Document]):
+         """Initialize with dict."""
+         self.dict_ = dict_
+
+     def add(self, texts: Dict[str, Document]) -> None:
+         """Add texts to in memory dictionary."""
+         overlapping = set(texts).intersection(self.dict_)
+         if overlapping:
+             raise ValueError(f"Tried to add ids that already exist: {overlapping}")
+         self.dict_ = dict(self.dict_, **texts)
+
+     def search(self, search: str) -> Union[str, Document]:
+         """Search via direct lookup."""
+         if search not in self.dict_:
+             return f"ID {search} not found."
+         else:
+             return self.dict_[search]
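InMemoryDocstore is a plain dict keyed by document ID: add() refuses IDs that already exist, and search() is a direct lookup that returns a not-found message instead of raising. A minimal usage sketch (illustrative IDs, not part of the commit):

from langchain.docstore.document import Document

store = InMemoryDocstore({"doc-1": Document(page_content="first chunk")})
store.add({"doc-2": Document(page_content="second chunk")})
print(store.search("doc-2").page_content)  # "second chunk"
print(store.search("missing"))             # "ID missing not found."
# store.add({"doc-1": Document(page_content="dup")}) would raise ValueError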
streamlit_langchain_chat/customized_langchain/indexes/__init__.py ADDED
@@ -0,0 +1,7 @@
+ from streamlit_langchain_chat.customized_langchain.indexes.graph import GraphIndexCreator
+ # from streamlit_langchain_chat.customized_langchain.vectorstore import VectorstoreIndexCreator
+
+ __all__ = [
+     "GraphIndexCreator",
+     # "VectorstoreIndexCreator"
+ ]
streamlit_langchain_chat/customized_langchain/indexes/graph.py ADDED
@@ -0,0 +1,20 @@
+ from typing import List
+
+ from langchain.indexes.graph import *
+ from langchain.indexes.graph import GraphIndexCreator as OriginalGraphIndexCreator
+
+
+ class GraphIndexCreator(OriginalGraphIndexCreator):
+     def from_texts(self, texts: List[str]) -> NetworkxEntityGraph:
+         """Create graph index from text."""
+         if self.llm is None:
+             raise ValueError("llm should not be None")
+         graph = self.graph_type()
+         chain = LLMChain(llm=self.llm, prompt=KNOWLEDGE_TRIPLE_EXTRACTION_PROMPT)
+
+         for text in texts:
+             output = chain.predict(text=text)
+             knowledge = parse_triples(output)
+             for triple in knowledge:
+                 graph.add_triple(triple)
+         return graph
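The override changes from_texts to accept a list of texts instead of a single string, extracting knowledge triples from each text and accumulating them in one NetworkxEntityGraph. A sketch of how it could be called, assuming an OpenAI key is configured (not part of the commit; the example sentences are made up):

from langchain.llms import OpenAI

creator = GraphIndexCreator(llm=OpenAI(temperature=0))  # requires OPENAI_API_KEY
graph = creator.from_texts([
    "Madrid is the capital of Spain.",
    "Spain is a country in Europe.",
])
print(graph.get_triples())  # list of extracted triples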
streamlit_langchain_chat/customized_langchain/llms/__init__.py ADDED
@@ -0,0 +1 @@
+ from streamlit_langchain_chat.customized_langchain.llms.openai import AzureOpenAI, OpenAI, OpenAIChat, AzureOpenAIChat
streamlit_langchain_chat/customized_langchain/llms/openai.py ADDED
@@ -0,0 +1,708 @@
1
+ """Wrapper around OpenAI APIs."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ import sys
6
+ from typing import (
7
+ Any,
8
+ Callable,
9
+ Dict,
10
+ Generator,
11
+ List,
12
+ Mapping,
13
+ Optional,
14
+ Set,
15
+ Tuple,
16
+ Union,
17
+ )
18
+
19
+ from pydantic import BaseModel, Extra, Field, root_validator
20
+ from tenacity import (
21
+ before_sleep_log,
22
+ retry,
23
+ retry_if_exception_type,
24
+ stop_after_attempt,
25
+ wait_exponential,
26
+ )
27
+
28
+ from langchain.llms.base import BaseLLM
29
+ from langchain.schema import Generation, LLMResult
30
+ from langchain.utils import get_from_dict_or_env
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
+ def update_token_usage(
36
+ keys: Set[str], response: Dict[str, Any], token_usage: Dict[str, Any]
37
+ ) -> None:
38
+ """Update token usage."""
39
+ _keys_to_use = keys.intersection(response["usage"])
40
+ for _key in _keys_to_use:
41
+ if _key not in token_usage:
42
+ token_usage[_key] = response["usage"][_key]
43
+ else:
44
+ token_usage[_key] += response["usage"][_key]
45
+
46
+
47
+ def _update_response(response: Dict[str, Any], stream_response: Dict[str, Any]) -> None:
48
+ """Update response from the stream response."""
49
+ response["choices"][0]["text"] += stream_response["choices"][0]["text"]
50
+ response["choices"][0]["finish_reason"] = stream_response["choices"][0][
51
+ "finish_reason"
52
+ ]
53
+ response["choices"][0]["logprobs"] = stream_response["choices"][0]["logprobs"]
54
+
55
+
56
+ def _streaming_response_template() -> Dict[str, Any]:
57
+ return {
58
+ "choices": [
59
+ {
60
+ "text": "",
61
+ "finish_reason": None,
62
+ "logprobs": None,
63
+ }
64
+ ]
65
+ }
66
+
67
+
68
+ def _create_retry_decorator(llm: Union[BaseOpenAI, OpenAIChat]) -> Callable[[Any], Any]:
69
+ import openai
70
+
71
+ min_seconds = 4
72
+ max_seconds = 10
73
+ # Wait 2^x * 1 second between each retry starting with
74
+ # 4 seconds, then up to 10 seconds, then 10 seconds afterwards
75
+ return retry(
76
+ reraise=True,
77
+ stop=stop_after_attempt(llm.max_retries),
78
+ wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds),
79
+ retry=(
80
+ retry_if_exception_type(openai.error.Timeout)
81
+ | retry_if_exception_type(openai.error.APIError)
82
+ | retry_if_exception_type(openai.error.APIConnectionError)
83
+ | retry_if_exception_type(openai.error.RateLimitError)
84
+ | retry_if_exception_type(openai.error.ServiceUnavailableError)
85
+ ),
86
+ before_sleep=before_sleep_log(logger, logging.WARNING),
87
+ )
88
+
89
+
90
+ def completion_with_retry(llm: Union[BaseOpenAI, OpenAIChat], **kwargs: Any) -> Any:
91
+ """Use tenacity to retry the completion call."""
92
+ retry_decorator = _create_retry_decorator(llm)
93
+
94
+ @retry_decorator
95
+ def _completion_with_retry(**kwargs: Any) -> Any:
96
+ return llm.client.create(**kwargs)
97
+
98
+ return _completion_with_retry(**kwargs)
99
+
100
+
101
+ async def acompletion_with_retry(
102
+ llm: Union[BaseOpenAI, OpenAIChat], **kwargs: Any
103
+ ) -> Any:
104
+ """Use tenacity to retry the async completion call."""
105
+ retry_decorator = _create_retry_decorator(llm)
106
+
107
+ @retry_decorator
108
+ async def _completion_with_retry(**kwargs: Any) -> Any:
109
+ # Use OpenAI's async api https://github.com/openai/openai-python#async-api
110
+ return await llm.client.acreate(**kwargs)
111
+
112
+ return await _completion_with_retry(**kwargs)
113
+
114
+
115
+ class BaseOpenAI(BaseLLM, BaseModel):
116
+ """Wrapper around OpenAI large language models.
117
+
118
+ To use, you should have the ``openai`` python package installed, and the
119
+ environment variable ``OPENAI_API_KEY`` set with your API key.
120
+
121
+ Any parameters that are valid to be passed to the openai.create call can be passed
122
+ in, even if not explicitly saved on this class.
123
+
124
+ Example:
125
+ .. code-block:: python
126
+
127
+ from langchain.llms import OpenAI
128
+ openai = OpenAI(model_name="text-davinci-003")
129
+ """
130
+
131
+ client: Any #: :meta private:
132
+ model_name: str = "text-davinci-003"
133
+ """Model name to use."""
134
+ temperature: float = 0.7
135
+ """What sampling temperature to use."""
136
+ max_tokens: int = 256
137
+ """The maximum number of tokens to generate in the completion.
138
+ -1 returns as many tokens as possible given the prompt and
139
+ the models maximal context size."""
140
+ top_p: float = 1
141
+ """Total probability mass of tokens to consider at each step."""
142
+ frequency_penalty: float = 0
143
+ """Penalizes repeated tokens according to frequency."""
144
+ presence_penalty: float = 0
145
+ """Penalizes repeated tokens."""
146
+ n: int = 1
147
+ """How many completions to generate for each prompt."""
148
+ best_of: int = 1
149
+ """Generates best_of completions server-side and returns the "best"."""
150
+ model_kwargs: Dict[str, Any] = Field(default_factory=dict)
151
+ """Holds any model parameters valid for `create` call not explicitly specified."""
152
+ openai_api_key: Optional[str] = None
153
+ batch_size: int = 20
154
+ """Batch size to use when passing multiple documents to generate."""
155
+ request_timeout: Optional[Union[float, Tuple[float, float]]] = None
156
+ """Timeout for requests to OpenAI completion API. Default is 600 seconds."""
157
+ logit_bias: Optional[Dict[str, float]] = Field(default_factory=dict)
158
+ """Adjust the probability of specific tokens being generated."""
159
+ max_retries: int = 6
160
+ """Maximum number of retries to make when generating."""
161
+ streaming: bool = False
162
+ """Whether to stream the results or not."""
163
+
164
+ def __new__(cls, **data: Any) -> Union[OpenAIChat, BaseOpenAI]: # type: ignore
165
+ """Initialize the OpenAI object."""
166
+ if data.get("model_name", "").startswith("gpt-3.5-turbo"):
167
+ return OpenAIChat(**data)
168
+ return super().__new__(cls)
169
+
170
+ class Config:
171
+ """Configuration for this pydantic object."""
172
+
173
+ extra = Extra.ignore
174
+
175
+ @root_validator(pre=True, allow_reuse=True)
176
+ def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
177
+ """Build extra kwargs from additional params that were passed in."""
178
+ all_required_field_names = {field.alias for field in cls.__fields__.values()}
179
+
180
+ extra = values.get("model_kwargs", {})
181
+ for field_name in list(values):
182
+ if field_name not in all_required_field_names:
183
+ if field_name in extra:
184
+ raise ValueError(f"Found {field_name} supplied twice.")
185
+ logger.warning(
186
+ f"""WARNING! {field_name} is not default parameter.
187
+ {field_name} was transferred to model_kwargs.
188
+ Please confirm that {field_name} is what you intended."""
189
+ )
190
+ extra[field_name] = values.pop(field_name)
191
+ values["model_kwargs"] = extra
192
+ return values
193
+
194
+ @root_validator(allow_reuse=True)
195
+ def validate_environment(cls, values: Dict) -> Dict:
196
+ """Validate that api key and python package exists in environment."""
197
+ openai_api_key = get_from_dict_or_env(
198
+ values, "openai_api_key", "OPENAI_API_KEY"
199
+ )
200
+ try:
201
+ import openai
202
+
203
+ openai.api_key = openai_api_key
204
+ values["client"] = openai.Completion
205
+ except ImportError:
206
+ raise ValueError(
207
+ "Could not import openai python package. "
208
+ "Please it install it with `pip install openai`."
209
+ )
210
+ if values["streaming"] and values["n"] > 1:
211
+ raise ValueError("Cannot stream results when n > 1.")
212
+ if values["streaming"] and values.get("best_of") and values["best_of"] > 1:
213
+ raise ValueError("Cannot stream results when best_of > 1.")
214
+ return values
215
+
216
+ @property
217
+ def _default_params(self) -> Dict[str, Any]:
218
+ """Get the default parameters for calling OpenAI API."""
219
+ normal_params = {
220
+ "temperature": self.temperature,
221
+ "max_tokens": self.max_tokens,
222
+ "top_p": self.top_p,
223
+ "frequency_penalty": self.frequency_penalty,
224
+ "presence_penalty": self.presence_penalty,
225
+ "n": self.n,
226
+ # "best_of": self.best_of,
227
+ "request_timeout": self.request_timeout,
228
+ "logit_bias": self.logit_bias,
229
+ }
230
+ return {**normal_params, **self.model_kwargs}
231
+
232
+ def _generate(
233
+ self, prompts: List[str], stop: Optional[List[str]] = None
234
+ ) -> LLMResult:
235
+ """Call out to OpenAI's endpoint with k unique prompts.
236
+
237
+ Args:
238
+ prompts: The prompts to pass into the model.
239
+ stop: Optional list of stop words to use when generating.
240
+
241
+ Returns:
242
+ The full LLM output.
243
+
244
+ Example:
245
+ .. code-block:: python
246
+
247
+ response = openai.generate(["Tell me a joke."])
248
+ """
249
+ # TODO: write a unit test for this
250
+ params = self._invocation_params
251
+ sub_prompts = self.get_sub_prompts(params, prompts, stop)
252
+ choices = []
253
+ token_usage: Dict[str, int] = {}
254
+ # Get the token usage from the response.
255
+ # Includes prompt, completion, and total tokens used.
256
+ _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
257
+ for _prompts in sub_prompts:
258
+ if self.streaming:
259
+ if len(_prompts) > 1:
260
+ raise ValueError("Cannot stream results with multiple prompts.")
261
+ params["stream"] = True
262
+ response = _streaming_response_template()
263
+ for stream_resp in completion_with_retry(
264
+ self, prompt=_prompts, **params
265
+ ):
266
+ self.callback_manager.on_llm_new_token(
267
+ stream_resp["choices"][0]["text"],
268
+ verbose=self.verbose,
269
+ logprobs=stream_resp["choices"][0]["logprobs"],
270
+ )
271
+ _update_response(response, stream_resp)
272
+ choices.extend(response["choices"])
273
+ else:
274
+ response = completion_with_retry(self, prompt=_prompts, **params)
275
+ choices.extend(response["choices"])
276
+ if not self.streaming:
277
+ # Can't update token usage if streaming
278
+ update_token_usage(_keys, response, token_usage)
279
+ return self.create_llm_result(choices, prompts, token_usage)
280
+
281
+ async def _agenerate(
282
+ self, prompts: List[str], stop: Optional[List[str]] = None
283
+ ) -> LLMResult:
284
+ """Call out to OpenAI's endpoint async with k unique prompts."""
285
+ params = self._invocation_params
286
+ sub_prompts = self.get_sub_prompts(params, prompts, stop)
287
+ choices = []
288
+ token_usage: Dict[str, int] = {}
289
+ # Get the token usage from the response.
290
+ # Includes prompt, completion, and total tokens used.
291
+ _keys = {"completion_tokens", "prompt_tokens", "total_tokens"}
292
+ for _prompts in sub_prompts:
293
+ if self.streaming:
294
+ if len(_prompts) > 1:
295
+ raise ValueError("Cannot stream results with multiple prompts.")
296
+ params["stream"] = True
297
+ response = _streaming_response_template()
298
+ async for stream_resp in await acompletion_with_retry(
299
+ self, prompt=_prompts, **params
300
+ ):
301
+ if self.callback_manager.is_async:
302
+ await self.callback_manager.on_llm_new_token(
303
+ stream_resp["choices"][0]["text"],
304
+ verbose=self.verbose,
305
+ logprobs=stream_resp["choices"][0]["logprobs"],
306
+ )
307
+ else:
308
+ self.callback_manager.on_llm_new_token(
309
+ stream_resp["choices"][0]["text"],
310
+ verbose=self.verbose,
311
+ logprobs=stream_resp["choices"][0]["logprobs"],
312
+ )
313
+ _update_response(response, stream_resp)
314
+ choices.extend(response["choices"])
315
+ else:
316
+ response = await acompletion_with_retry(self, prompt=_prompts, **params)
317
+ choices.extend(response["choices"])
318
+ if not self.streaming:
319
+ # Can't update token usage if streaming
320
+ update_token_usage(_keys, response, token_usage)
321
+ return self.create_llm_result(choices, prompts, token_usage)
322
+
323
+ def get_sub_prompts(
324
+ self,
325
+ params: Dict[str, Any],
326
+ prompts: List[str],
327
+ stop: Optional[List[str]] = None,
328
+ ) -> List[List[str]]:
329
+ """Get the sub prompts for llm call."""
330
+ if stop is not None:
331
+ if "stop" in params:
332
+ raise ValueError("`stop` found in both the input and default params.")
333
+ params["stop"] = stop
334
+ if params["max_tokens"] == -1:
335
+ if len(prompts) != 1:
336
+ raise ValueError(
337
+ "max_tokens set to -1 not supported for multiple inputs."
338
+ )
339
+ params["max_tokens"] = self.max_tokens_for_prompt(prompts[0])
340
+ sub_prompts = [
341
+ prompts[i : i + self.batch_size]
342
+ for i in range(0, len(prompts), self.batch_size)
343
+ ]
344
+ return sub_prompts
345
+
346
+ def create_llm_result(
347
+ self, choices: Any, prompts: List[str], token_usage: Dict[str, int]
348
+ ) -> LLMResult:
349
+ """Create the LLMResult from the choices and prompts."""
350
+ generations = []
351
+ for i, _ in enumerate(prompts):
352
+ sub_choices = choices[i * self.n : (i + 1) * self.n]
353
+ generations.append(
354
+ [
355
+ Generation(
356
+ text=choice["text"],
357
+ generation_info=dict(
358
+ finish_reason=choice.get("finish_reason"),
359
+ logprobs=choice.get("logprobs"),
360
+ ),
361
+ )
362
+ for choice in sub_choices
363
+ ]
364
+ )
365
+ return LLMResult(
366
+ generations=generations, llm_output={"token_usage": token_usage}
367
+ )
368
+
369
+ def stream(self, prompt: str, stop: Optional[List[str]] = None) -> Generator:
370
+ """Call OpenAI with streaming flag and return the resulting generator.
371
+
372
+ BETA: this is a beta feature while we figure out the right abstraction.
373
+ Once that happens, this interface could change.
374
+
375
+ Args:
376
+ prompt: The prompts to pass into the model.
377
+ stop: Optional list of stop words to use when generating.
378
+
379
+ Returns:
380
+ A generator representing the stream of tokens from OpenAI.
381
+
382
+ Example:
383
+ .. code-block:: python
384
+
385
+ generator = openai.stream("Tell me a joke.")
386
+ for token in generator:
387
+ yield token
388
+ """
389
+ params = self.prep_streaming_params(stop)
390
+ generator = self.client.create(prompt=prompt, **params)
391
+
392
+ return generator
393
+
394
+ def prep_streaming_params(self, stop: Optional[List[str]] = None) -> Dict[str, Any]:
395
+ """Prepare the params for streaming."""
396
+ params = self._invocation_params
397
+ if params.get('best_of') and params["best_of"] != 1:
398
+ raise ValueError("OpenAI only supports best_of == 1 for streaming")
399
+ if stop is not None:
400
+ if "stop" in params:
401
+ raise ValueError("`stop` found in both the input and default params.")
402
+ params["stop"] = stop
403
+ params["stream"] = True
404
+ return params
405
+
406
+ @property
407
+ def _invocation_params(self) -> Dict[str, Any]:
408
+ """Get the parameters used to invoke the model."""
409
+ return self._default_params
410
+
411
+ @property
412
+ def _identifying_params(self) -> Mapping[str, Any]:
413
+ """Get the identifying parameters."""
414
+ return {**{"model_name": self.model_name}, **self._default_params}
415
+
416
+ @property
417
+ def _llm_type(self) -> str:
418
+ """Return type of llm."""
419
+ return "openai"
420
+
421
+ def get_num_tokens(self, text: str) -> int:
422
+ """Calculate num tokens with tiktoken package."""
423
+ # tiktoken NOT supported for Python 3.8 or below
424
+ if sys.version_info[1] <= 8:
425
+ return super().get_num_tokens(text)
426
+ try:
427
+ import tiktoken
428
+ except ImportError:
429
+ raise ValueError(
430
+ "Could not import tiktoken python package. "
431
+ "This is needed in order to calculate get_num_tokens. "
432
+ "Please it install it with `pip install tiktoken`."
433
+ )
434
+ encoder = "gpt2"
435
+ if self.model_name in ("text-davinci-003", "text-davinci-002"):
436
+ encoder = "p50k_base"
437
+ if self.model_name.startswith("code"):
438
+ encoder = "p50k_base"
439
+ # create a GPT-3 encoder instance
440
+ enc = tiktoken.get_encoding(encoder)
441
+
442
+ # encode the text using the GPT-3 encoder
443
+ tokenized_text = enc.encode(text)
444
+
445
+ # calculate the number of tokens in the encoded text
446
+ return len(tokenized_text)
447
+
448
+ def modelname_to_contextsize(self, modelname: str) -> int:
449
+ """Calculate the maximum number of tokens possible to generate for a model.
450
+
451
+ text-davinci-003: 4,097 tokens
452
+ text-curie-001: 2,048 tokens
453
+ text-babbage-001: 2,048 tokens
454
+ text-ada-001: 2,048 tokens
455
+ code-davinci-002: 8,000 tokens
456
+ code-cushman-001: 2,048 tokens
457
+
458
+ Args:
459
+ modelname: The modelname we want to know the context size for.
460
+
461
+ Returns:
462
+ The maximum context size
463
+
464
+ Example:
465
+ .. code-block:: python
466
+
467
+ max_tokens = openai.modelname_to_contextsize("text-davinci-003")
468
+ """
469
+ if modelname == "text-davinci-003":
470
+ return 4097
471
+ elif modelname == "text-curie-001":
472
+ return 2048
473
+ elif modelname == "text-babbage-001":
474
+ return 2048
475
+ elif modelname == "text-ada-001":
476
+ return 2048
477
+ elif modelname == "code-davinci-002":
478
+ return 8000
479
+ elif modelname == "code-cushman-001":
480
+ return 2048
481
+ else:
482
+ return 4097
483
+
484
+ def max_tokens_for_prompt(self, prompt: str) -> int:
485
+ """Calculate the maximum number of tokens possible to generate for a prompt.
486
+
487
+ Args:
488
+ prompt: The prompt to pass into the model.
489
+
490
+ Returns:
491
+ The maximum number of tokens to generate for a prompt.
492
+
493
+ Example:
494
+ .. code-block:: python
495
+
496
+ max_tokens = openai.max_token_for_prompt("Tell me a joke.")
497
+ """
498
+ num_tokens = self.get_num_tokens(prompt)
499
+
500
+ # get max context size for model by name
501
+ max_size = self.modelname_to_contextsize(self.model_name)
502
+ return max_size - num_tokens
503
+
504
+
505
+ class OpenAI(BaseOpenAI):
506
+ """Generic OpenAI class that uses model name."""
507
+
508
+ @property
509
+ def _invocation_params(self) -> Dict[str, Any]:
510
+ return {**{"model": self.model_name}, **super()._invocation_params}
511
+
512
+
513
+ class AzureOpenAI(BaseOpenAI):
514
+ """Azure specific OpenAI class that uses deployment name."""
515
+
516
+ deployment_name: str = ""
517
+ """Deployment name to use."""
518
+
519
+ @property
520
+ def _identifying_params(self) -> Mapping[str, Any]:
521
+ return {
522
+ **{"deployment_name": self.deployment_name},
523
+ **super()._identifying_params,
524
+ }
525
+
526
+ @property
527
+ def _invocation_params(self) -> Dict[str, Any]:
528
+ return {**{"engine": self.deployment_name}, **super()._invocation_params}
529
+
530
+
531
+ class OpenAIChat(BaseLLM, BaseModel):
532
+ """Wrapper around OpenAI Chat large language models.
533
+
534
+ To use, you should have the ``openai`` python package installed, and the
535
+ environment variable ``OPENAI_API_KEY`` set with your API key.
536
+
537
+ Any parameters that are valid to be passed to the openai.create call can be passed
538
+ in, even if not explicitly saved on this class.
539
+
540
+ Example:
541
+ .. code-block:: python
542
+
543
+ from langchain.llms import OpenAIChat
544
+ openaichat = OpenAIChat(model_name="gpt-3.5-turbo")
545
+ """
546
+
547
+ client: Any #: :meta private:
548
+ model_name: str = "gpt-3.5-turbo"
549
+ """Model name to use."""
550
+ model_kwargs: Dict[str, Any] = Field(default_factory=dict)
551
+ """Holds any model parameters valid for `create` call not explicitly specified."""
552
+ openai_api_key: Optional[str] = None
553
+ max_retries: int = 6
554
+ """Maximum number of retries to make when generating."""
555
+ prefix_messages: List = Field(default_factory=list)
556
+ """Series of messages for Chat input."""
557
+ streaming: bool = False
558
+ """Whether to stream the results or not."""
559
+
560
+ class Config:
561
+ """Configuration for this pydantic object."""
562
+
563
+ extra = Extra.ignore
564
+
565
+ @root_validator(pre=True, allow_reuse=True)
566
+ def build_extra(cls, values: Dict[str, Any]) -> Dict[str, Any]:
567
+ """Build extra kwargs from additional params that were passed in."""
568
+ all_required_field_names = {field.alias for field in cls.__fields__.values()}
569
+
570
+ extra = values.get("model_kwargs", {})
571
+ for field_name in list(values):
572
+ if field_name not in all_required_field_names:
573
+ if field_name in extra:
574
+ raise ValueError(f"Found {field_name} supplied twice.")
575
+ extra[field_name] = values.pop(field_name)
576
+ values["model_kwargs"] = extra
577
+ return values
578
+
579
+ @root_validator(allow_reuse=True)
580
+ def validate_environment(cls, values: Dict) -> Dict:
581
+ """Validate that api key and python package exists in environment."""
582
+ openai_api_key = get_from_dict_or_env(
583
+ values, "openai_api_key", "OPENAI_API_KEY"
584
+ )
585
+ try:
586
+ import openai
587
+
588
+ openai.api_key = openai_api_key
589
+ except ImportError:
590
+ raise ValueError(
591
+ "Could not import openai python package. "
592
+ "Please it install it with `pip install openai`."
593
+ )
594
+ try:
595
+ values["client"] = openai.ChatCompletion
596
+ except AttributeError:
597
+ raise ValueError(
598
+ "`openai` has no `ChatCompletion` attribute, this is likely "
599
+ "due to an old version of the openai package. Try upgrading it "
600
+ "with `pip install --upgrade openai`."
601
+ )
602
+ return values
603
+
604
+ @property
605
+ def _default_params(self) -> Dict[str, Any]:
606
+ """Get the default parameters for calling OpenAI API."""
607
+ return self.model_kwargs
608
+
609
+ def _get_chat_params(
610
+ self, prompts: List[str], stop: Optional[List[str]] = None
611
+ ) -> Tuple:
612
+ if len(prompts) > 1:
613
+ raise ValueError(
614
+ f"OpenAIChat currently only supports single prompt, got {prompts}"
615
+ )
616
+ messages = self.prefix_messages + [{"role": "user", "content": prompts[0]}]
617
+ params: Dict[str, Any] = {**{"model": self.model_name}, **self._default_params}
618
+ if stop is not None:
619
+ if "stop" in params:
620
+ raise ValueError("`stop` found in both the input and default params.")
621
+ params["stop"] = stop
622
+ return messages, params
623
+
624
+ def _generate(
625
+ self, prompts: List[str], stop: Optional[List[str]] = None
626
+ ) -> LLMResult:
627
+ messages, params = self._get_chat_params(prompts, stop)
628
+ if self.streaming:
629
+ response = ""
630
+ params["stream"] = True
631
+ for stream_resp in completion_with_retry(self, messages=messages, **params):
632
+ token = stream_resp["choices"][0]["delta"].get("content", "")
633
+ response += token
634
+ self.callback_manager.on_llm_new_token(
635
+ token,
636
+ verbose=self.verbose,
637
+ )
638
+ return LLMResult(
639
+ generations=[[Generation(text=response)]],
640
+ )
641
+ else:
642
+ full_response = completion_with_retry(self, messages=messages, **params)
643
+ return LLMResult(
644
+ generations=[
645
+ [Generation(text=full_response["choices"][0]["message"]["content"])]
646
+ ],
647
+ llm_output={"token_usage": full_response["usage"]},
648
+ )
649
+
650
+ async def _agenerate(
651
+ self, prompts: List[str], stop: Optional[List[str]] = None
652
+ ) -> LLMResult:
653
+ messages, params = self._get_chat_params(prompts, stop)
654
+ if self.streaming:
655
+ response = ""
656
+ params["stream"] = True
657
+ async for stream_resp in await acompletion_with_retry(
658
+ self, messages=messages, **params
659
+ ):
660
+ token = stream_resp["choices"][0]["delta"].get("content", "")
661
+ response += token
662
+ if self.callback_manager.is_async:
663
+ await self.callback_manager.on_llm_new_token(
664
+ token,
665
+ verbose=self.verbose,
666
+ )
667
+ else:
668
+ self.callback_manager.on_llm_new_token(
669
+ token,
670
+ verbose=self.verbose,
671
+ )
672
+ return LLMResult(
673
+ generations=[[Generation(text=response)]],
674
+ )
675
+ else:
676
+ full_response = await acompletion_with_retry(
677
+ self, messages=messages, **params
678
+ )
679
+ return LLMResult(
680
+ generations=[
681
+ [Generation(text=full_response["choices"][0]["message"]["content"])]
682
+ ],
683
+ llm_output={"token_usage": full_response["usage"]},
684
+ )
685
+
686
+ @property
687
+ def _identifying_params(self) -> Mapping[str, Any]:
688
+ """Get the identifying parameters."""
689
+ return {**{"model_name": self.model_name}, **self._default_params}
690
+
691
+ @property
692
+ def _llm_type(self) -> str:
693
+ """Return type of llm."""
694
+ return "openai-chat"
695
+
696
+
697
+ class AzureOpenAIChat(OpenAIChat):
698
+ """Azure specific OpenAI class that uses deployment name."""
699
+
700
+ deployment_name: str = ""
701
+ """Deployment name to use."""
702
+
703
+ @property
704
+ def _identifying_params(self) -> Mapping[str, Any]:
705
+ return {
706
+ **{"deployment_name": self.deployment_name},
707
+ **super()._identifying_params,
708
+ }
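These classes track langchain's pre-1.0 OpenAI wrappers and add Azure variants that carry a deployment_name. A hypothetical sketch of the chat wrapper; the model name and temperature are placeholders, OPENAI_API_KEY is assumed to be set, and the Azure variants would additionally need the deployment/engine wiring expected by the old openai package:

# Hypothetical usage (not part of the commit); assumes OPENAI_API_KEY is set.
chat_llm = OpenAIChat(model_name="gpt-3.5-turbo", model_kwargs={"temperature": 0.1})
result = chat_llm.generate(["Summarise retrieval augmented generation in one sentence."])
print(result.generations[0][0].text)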
streamlit_langchain_chat/customized_langchain/vectorstores/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Wrappers on top of vector stores."""
+ from streamlit_langchain_chat.customized_langchain.vectorstores.faiss import FAISS
+ from streamlit_langchain_chat.customized_langchain.vectorstores.pinecone import Pinecone
+
+ __all__ = [
+     "FAISS",
+     "Pinecone",
+ ]
streamlit_langchain_chat/customized_langchain/vectorstores/faiss.py ADDED
@@ -0,0 +1,100 @@
+ # import hashlib
+
+ from langchain.vectorstores.faiss import *
+ from langchain.vectorstores.faiss import FAISS as OriginalFAISS
+
+ from streamlit_langchain_chat.customized_langchain.docstore.in_memory import InMemoryDocstore
+
+
+ class FAISS(OriginalFAISS):
+     def __add(
+         self,
+         texts: Iterable[str],
+         embeddings: Iterable[List[float]],
+         metadatas: Optional[List[dict]] = None,
+         **kwargs: Any,
+     ) -> List[str]:
+         if not isinstance(self.docstore, AddableMixin):
+             raise ValueError(
+                 "If trying to add texts, the underlying docstore should support "
+                 f"adding items, which {self.docstore} does not"
+             )
+         documents = []
+         for i, text in enumerate(texts):
+             metadata = metadatas[i] if metadatas else {}
+             documents.append(Document(page_content=text, metadata=metadata))
+         # Add to the index, the index_to_id mapping, and the docstore.
+         starting_len = len(self.index_to_docstore_id)
+         self.index.add(np.array(embeddings, dtype=np.float32))
+         # Get list of index, id, and docs.
+         full_info = [
+             (starting_len + i, str(uuid.uuid4()), doc)
+             for i, doc in enumerate(documents)
+         ]
+         # Add information to docstore and index.
+         self.docstore.add({_id: doc for _, _id, doc in full_info})
+         index_to_id = {index: _id for index, _id, _ in full_info}
+         self.index_to_docstore_id.update(index_to_id)
+         return [_id for _, _id, _ in full_info]
+
+     @classmethod
+     def __from(
+         cls,
+         texts: List[str],
+         embeddings: List[List[float]],
+         embedding: Embeddings,
+         metadatas: Optional[List[dict]] = None,
+         **kwargs: Any,
+     ) -> FAISS:
+         faiss = dependable_faiss_import()
+         index = faiss.IndexFlatL2(len(embeddings[0]))
+         index.add(np.array(embeddings, dtype=np.float32))
+         documents = []
+         for i, text in enumerate(texts):
+             metadata = metadatas[i] if metadatas else {}
+             documents.append(Document(page_content=text, metadata=metadata))
+         index_to_id = {i: str(uuid.uuid4()) for i in range(len(documents))}
+
+         # # TODO: switch to using the hash, and see where it would go so the chunk is not loaded into the dataset
+         # index_to_id_2 = dict()
+         # for i in range(len(documents)):
+         #     h = hashlib.new('sha256')
+         #     text_ = documents[i].page_content
+         #     h.update(text_.encode())
+         #     index_to_id_2[i] = str(h.hexdigest())
+         # #
+         docstore = InMemoryDocstore(
+             {index_to_id[i]: doc for i, doc in enumerate(documents)}
+         )
+         return cls(embedding.embed_query, index, docstore, index_to_id)
+
+     @classmethod
+     def from_texts(
+         cls,
+         texts: List[str],
+         embedding: Embeddings,
+         metadatas: Optional[List[dict]] = None,
+         **kwargs: Any,
+     ) -> FAISS:
+         """Construct FAISS wrapper from raw documents.
+
+         This is a user friendly interface that:
+             1. Embeds documents.
+             2. Creates an in memory docstore
+             3. Initializes the FAISS database
+
+         This is intended to be a quick way to get started.
+
+         Example:
+             .. code-block:: python
+
+                 from langchain import FAISS
+                 from langchain.embeddings import OpenAIEmbeddings
+                 embeddings = OpenAIEmbeddings()
+                 faiss = FAISS.from_texts(texts, embeddings)
+         """
+         # embeddings = embedding.embed_documents(texts)
+         print(f"len(texts): {len(texts)}")  # TODO: remove
+         embeddings = [embedding.embed_documents([text])[0] for text in texts]
+         print(f"len(embeddings): {len(embeddings)}")  # TODO: remove
+         return cls.__from(texts, embeddings, embedding, metadatas, **kwargs)
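The customization keeps langchain's FAISS interface but embeds each chunk with its own embed_documents call instead of one batched call, and backs the index with the customized InMemoryDocstore. A usage sketch, assuming faiss-cpu is installed and an OpenAI key is configured (not part of the commit; chunk texts and metadata are made up):

from langchain.embeddings import OpenAIEmbeddings

chunks = ["first chunk of a document", "second chunk of a document"]
index = FAISS.from_texts(chunks, OpenAIEmbeddings(),
                         metadatas=[{"key": "doc1"}, {"key": "doc1"}])
hits = index.similarity_search("what does the document say?", k=1)
print(hits[0].page_content, hits[0].metadata)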
streamlit_langchain_chat/customized_langchain/vectorstores/pinecone.py ADDED
@@ -0,0 +1,79 @@
+ from langchain.vectorstores.pinecone import *
+ from langchain.vectorstores.pinecone import Pinecone as OriginalPinecone
+
+
+ class Pinecone(OriginalPinecone):
+     @classmethod
+     def from_texts(
+         cls,
+         texts: List[str],
+         embedding: Embeddings,
+         metadatas: Optional[List[dict]] = None,
+         ids: Optional[List[str]] = None,
+         batch_size: int = 32,
+         text_key: str = "text",
+         index_name: Optional[str] = None,
+         namespace: Optional[str] = None,
+         **kwargs: Any,
+     ) -> Pinecone:
+         """Construct Pinecone wrapper from raw documents.
+
+         This is a user friendly interface that:
+             1. Embeds documents.
+             2. Adds the documents to a provided Pinecone index
+
+         This is intended to be a quick way to get started.
+
+         Example:
+             .. code-block:: python
+
+                 from langchain import Pinecone
+                 from langchain.embeddings import OpenAIEmbeddings
+                 embeddings = OpenAIEmbeddings()
+                 pinecone = Pinecone.from_texts(
+                     texts,
+                     embeddings,
+                     index_name="langchain-demo"
+                 )
+         """
+         try:
+             import pinecone
+         except ImportError:
+             raise ValueError(
+                 "Could not import pinecone python package. "
+                 "Please install it with `pip install pinecone-client`."
+             )
+         _index_name = index_name or str(uuid.uuid4())
+         indexes = pinecone.list_indexes()  # checks if provided index exists
+         if _index_name in indexes:
+             index = pinecone.Index(_index_name)
+         else:
+             index = None
+         for i in range(0, len(texts), batch_size):
+             # set end position of batch
+             i_end = min(i + batch_size, len(texts))
+             # get batch of texts and ids
+             lines_batch = texts[i:i_end]
+             # create ids if not provided
+             if ids:
+                 ids_batch = ids[i:i_end]
+             else:
+                 ids_batch = [str(uuid.uuid4()) for n in range(i, i_end)]
+             # create embeddings
+             # embeds = embedding.embed_documents(lines_batch)
+             embeds = [embedding.embed_documents([line_batch])[0] for line_batch in lines_batch]
+             # prep metadata and upsert batch
+             if metadatas:
+                 metadata = metadatas[i:i_end]
+             else:
+                 metadata = [{} for _ in range(i, i_end)]
+             for j, line in enumerate(lines_batch):
+                 metadata[j][text_key] = line
+             to_upsert = zip(ids_batch, embeds, metadata)
+             # Create index if it does not exist
+             if index is None:
+                 pinecone.create_index(_index_name, dimension=len(embeds[0]))
+                 index = pinecone.Index(_index_name)
+             # upsert to Pinecone
+             index.upsert(vectors=list(to_upsert), namespace=namespace)
+         return cls(index, embedding.embed_query, text_key, namespace)
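As with the FAISS wrapper, from_texts here embeds texts one at a time, creates the Pinecone index on first use if it does not already exist, and upserts each batch with the raw text stored under text_key. A sketch with placeholder credentials and index name (not part of the commit):

import pinecone
from langchain.embeddings import OpenAIEmbeddings

pinecone.init(api_key="YOUR_PINECONE_API_KEY", environment="YOUR_ENVIRONMENT")  # placeholders
store = Pinecone.from_texts(
    ["first chunk", "second chunk"],
    OpenAIEmbeddings(),
    index_name="example-index",  # created automatically if missing
)
print(store.similarity_search("first", k=1))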
streamlit_langchain_chat/dataset.py ADDED
@@ -0,0 +1,740 @@
1
+ import time
2
+ from dataclasses import dataclass
3
+ from datetime import datetime
4
+ from functools import reduce
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import re
9
+ import requests
10
+ from requests.models import MissingSchema
11
+ import sys
12
+ from typing import List, Optional, Tuple, Dict, Callable, Any
13
+
14
+ from bs4 import BeautifulSoup
15
+ import docx
16
+ from html2text import html2text
17
+ import langchain
18
+ from langchain.callbacks import get_openai_callback
19
+ from langchain.cache import SQLiteCache
20
+ from langchain.chains import LLMChain
21
+ from langchain.chains.chat_vector_db.prompts import CONDENSE_QUESTION_PROMPT
22
+ from langchain.chat_models import ChatOpenAI
23
+ from langchain.chat_models.base import BaseChatModel
24
+ from langchain.document_loaders import PyPDFLoader, PyMuPDFLoader
25
+ from langchain.embeddings.base import Embeddings
26
+ from langchain.embeddings.openai import OpenAIEmbeddings
27
+ from langchain.llms import OpenAI
28
+ from langchain.llms.base import LLM, BaseLLM
29
+ from langchain.prompts.chat import AIMessagePromptTemplate
30
+ from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter
31
+ from langchain.vectorstores import Pinecone as OriginalPinecone
32
+ import numpy as np
33
+ import openai
34
+ import pinecone
35
+ from pptx import Presentation
36
+ from pypdf import PdfReader
37
+ import trafilatura
38
+
39
+ from streamlit_langchain_chat.constants import *
40
+ from streamlit_langchain_chat.customized_langchain.vectorstores import FAISS
41
+ from streamlit_langchain_chat.customized_langchain.vectorstores import Pinecone
42
+ from streamlit_langchain_chat.utils import maybe_is_text, maybe_is_truncated
43
+ from streamlit_langchain_chat.prompts import *
44
+
45
+
46
+ if REUSE_ANSWERS:
47
+ CACHE_PATH = TEMP_DIR / "llm_cache.db"
48
+ os.makedirs(os.path.dirname(CACHE_PATH), exist_ok=True)
49
+ langchain.llm_cache = SQLiteCache(str(CACHE_PATH))
50
+
51
+ # option 1
52
+ TextSplitter = TokenTextSplitter
53
+ # option 2
54
+ # TextSplitter = RecursiveCharacterTextSplitter  # used by gpt4_pdf_chatbot_langchain (aka GPCL)
55
+
56
+
57
+ @dataclass
58
+ class Answer:
59
+ """A class to hold the answer to a question."""
60
+ question: str = ""
61
+ answer: str = ""
62
+ context: str = ""
63
+ chunks: str = ""
64
+ packages: List[Any] = None
65
+ references: str = ""
66
+ cost_str: str = ""
67
+ passages: Dict[str, str] = None
68
+ tokens: List[Dict] = None
69
+
70
+ def __post_init__(self):
71
+ """Initialize the answer."""
72
+ if self.packages is None:
73
+ self.packages = []
74
+ if self.passages is None:
75
+ self.passages = {}
76
+
77
+ def __str__(self) -> str:
78
+ """Return the answer as a string."""
79
+ return self.answer
80
+
81
+
82
+ def parse_docx(path, citation, key, chunk_chars=2000, overlap=50):
83
+ try:
84
+ document = docx.Document(path)
85
+ fullText = []
86
+ for paragraph in document.paragraphs:
87
+ fullText.append(paragraph.text)
88
+ doc = '\n'.join(fullText) + '\n'
89
+ except Exception as e:
90
+ print(f"code_error: {e}")
91
+ sys.exit(1)
92
+
93
+ if doc:
94
+ text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
95
+ texts = text_splitter.split_text(doc)
96
+ return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
97
+ else:
98
+ return [], []
99
+
100
+
101
+ # TODO: if you add a connector with the format loader = ... ; data = loader.load();
102
+ # you could plug in all of the langchain document loaders
103
+ # https://langchain.readthedocs.io/en/stable/modules/document_loaders/examples/pdf.html
104
+ def parse_pdf(path, citation, key, chunk_chars=2000, overlap=50):
105
+ pdfFileObj = open(path, "rb")
106
+ pdfReader = PdfReader(pdfFileObj)
107
+ splits = []
108
+ split = ""
109
+ pages = []
110
+ metadatas = []
111
+ for i, page in enumerate(pdfReader.pages):
112
+ split += page.extract_text()
113
+ pages.append(str(i + 1))
114
+ # split could be so long it needs to be split
115
+ # into multiple chunks. Or it could be so short
116
+ # that it needs to be combined with the next chunk.
117
+ while len(split) > chunk_chars:
118
+ splits.append(split[:chunk_chars])
119
+ # pretty formatting of pages (e.g. 1-3, 4, 5-7)
120
+ pg = "-".join([pages[0], pages[-1]])
121
+ metadatas.append(
122
+ dict(
123
+ citation=citation,
124
+ dockey=key,
125
+ key=f"{key} pages {pg}",
126
+ )
127
+ )
128
+ split = split[chunk_chars - overlap:]
129
+ pages = [str(i + 1)]
130
+ if len(split) > overlap:
131
+ splits.append(split[:chunk_chars])
132
+ pg = "-".join([pages[0], pages[-1]])
133
+ metadatas.append(
134
+ dict(
135
+ citation=citation,
136
+ dockey=key,
137
+ key=f"{key} pages {pg}",
138
+ )
139
+ )
140
+ pdfFileObj.close()
141
+
142
+ # # ### option 2. PyPDFLoader
143
+ # loader = PyPDFLoader(path)
144
+ # data = loader.load_and_split()
145
+ # # ### option 2.1. PyPDFLoader as used by GPCL, which then splits the loaded documents with the text splitter
146
+ # loader = PyPDFLoader(path)
147
+ # rawDocs = loader.load()
148
+ # text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
149
+ # texts = text_splitter.split_documents(rawDocs)
150
+ # # ### option 3. PDFMiner. This one seems like the best option
151
+ # loader = PyMuPDFLoader(path)
152
+ # data = loader.load()
153
+ return splits, metadatas
154
+
155
+
156
+ def parse_pptx(path, citation, key, chunk_chars=2000, overlap=50):
157
+ try:
158
+ presentation = Presentation(path)
159
+ fullText = []
160
+ for slide in presentation.slides:
161
+ for shape in slide.shapes:
162
+ if hasattr(shape, "text"):
163
+ fullText.append(shape.text)
164
+ doc = ''.join(fullText)
165
+
166
+ if doc:
167
+ text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
168
+ texts = text_splitter.split_text(doc)
169
+ return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
170
+ else:
171
+ return [], []
172
+
173
+ except Exception as e:
174
+ print(f"code_error: {e}")
175
+ sys.exit(1)
176
+
177
+
178
+ def parse_txt(path, citation, key, chunk_chars=2000, overlap=50, html=False):
179
+ try:
180
+ with open(path) as f:
181
+ doc = f.read()
182
+ except UnicodeDecodeError as e:
183
+ with open(path, encoding="utf-8", errors="ignore") as f:
184
+ doc = f.read()
185
+ if html:
186
+ doc = html2text(doc)
187
+ # yo, no idea why but the texts are not split correctly
188
+ text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
189
+ texts = text_splitter.split_text(doc)
190
+ return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
191
+
192
+
193
+ def parse_url(url: str, citation, key, chunk_chars=2000, overlap=50):
194
+ def beautifulsoup_extract_text_fallback(response_content):
195
+ """
196
+ This is a fallback function, so that we can always return a value for text content.
197
+ Even for when both Trafilatura and BeautifulSoup are unable to extract the text from a
198
+ single URL.
199
+ """
200
+
201
+ # Create the beautifulsoup object:
202
+ soup = BeautifulSoup(response_content, 'html.parser')
203
+
204
+ # Finding the text:
205
+ text = soup.find_all(text=True)
206
+
207
+ # Remove unwanted tag elements:
208
+ cleaned_text = ''
209
+ blacklist = [
210
+ '[document]',
211
+ 'noscript',
212
+ 'header',
213
+ 'html',
214
+ 'meta',
215
+ 'head',
216
+ 'input',
217
+ 'script',
218
+ 'style', ]
219
+
220
+ # Then we will loop over every item in the extract text and make sure that the beautifulsoup4 tag
221
+ # is NOT in the blacklist
222
+ for item in text:
223
+ if item.parent.name not in blacklist:
224
+ cleaned_text += f'{item} ' # cleaned_text += '{} '.format(item)
225
+
226
+ # Remove any tab separation and strip the text:
227
+ cleaned_text = cleaned_text.replace('\t', '')
228
+ return cleaned_text.strip()
229
+
230
+ def extract_text_from_single_web_page(url):
231
+ print(f"\n===========\n{url=}\n===========\n")
232
+ downloaded_url = trafilatura.fetch_url(url)
233
+ a = None
234
+ try:
235
+ a = trafilatura.extract(downloaded_url,
236
+ output_format='json',
237
+ with_metadata=True,
238
+ include_comments=False,
239
+ date_extraction_params={'extensive_search': True,
240
+ 'original_date': True})
241
+ except AttributeError:
242
+ a = trafilatura.extract(downloaded_url,
243
+ output_format='json',
244
+ with_metadata=True,
245
+ date_extraction_params={'extensive_search': True,
246
+ 'original_date': True})
247
+ except Exception as e:
248
+ print(f"code_error: {e}")
249
+
250
+ if a:
251
+ json_output = json.loads(a)
252
+ return json_output['text']
253
+ else:
254
+ try:
255
+ headers = {'User-Agent': 'Chrome/83.0.4103.106'}
256
+ resp = requests.get(url, headers=headers)
257
+ print(f"{resp=}\n")
258
+ # We will only extract the text from successful requests:
259
+ if resp.status_code == 200:
260
+ return beautifulsoup_extract_text_fallback(resp.content)
261
+ else:
262
+ # This line will handle for any failures in both the Trafilature and BeautifulSoup4 functions:
263
+ return np.nan
264
+ # Handling for any URLs that don't have the correct protocol
265
+ except MissingSchema:
266
+ return np.nan
267
+
268
+ text_to_split = extract_text_from_single_web_page(url)
269
+ text_splitter = TextSplitter(chunk_size=chunk_chars, chunk_overlap=overlap)
270
+ texts = text_splitter.split_text(text_to_split)
271
+ return texts, [dict(citation=citation, dockey=key, key=key)] * len(texts)
272
+
273
+
274
+ def read_source(path: str = None,
275
+ citation: str = None,
276
+ key: str = None,
277
+ chunk_chars: int = 3000,
278
+ overlap: int = 100,
279
+ disable_check: bool = False):
280
+ if path.endswith(".pdf"):
281
+ return parse_pdf(path, citation, key, chunk_chars, overlap)
282
+ elif path.endswith(".txt"):
283
+ return parse_txt(path, citation, key, chunk_chars, overlap)
284
+ elif path.endswith(".html"):
285
+ return parse_txt(path, citation, key, chunk_chars, overlap, html=True)
286
+ elif path.endswith(".docx"):
287
+ return parse_docx(path, citation, key, chunk_chars, overlap)
288
+ elif path.endswith(".pptx"):
289
+ return parse_pptx(path, citation, key, chunk_chars, overlap)
290
+ elif path.startswith("http://") or path.startswith("https://"):
291
+ return parse_url(path, citation, key, chunk_chars, overlap)
292
+ # TODO: poner mas conectores
293
+ # else:
294
+ # return parse_code_txt(path, citation, key, chunk_chars, overlap)
295
+ else:
296
+ raise "unknown extension"
297
+
298
+
299
+ class Dataset:
300
+ """A collection of documents to be used for answering questions."""
301
+ def __init__(
302
+ self,
303
+ chunk_size_limit: int = 3000,
304
+ llm: Optional[BaseLLM] | Optional[BaseChatModel] = None,
305
+ summary_llm: Optional[BaseLLM] = None,
306
+ name: str = "default",
307
+ index_path: Optional[Path] = None,
308
+ ) -> None:
309
+ """Initialize the collection of documents.
310
+
311
+ Args:
312
+ chunk_size_limit: The maximum number of characters to use for a single chunk of text.
313
+ llm: The language model to use for answering questions. Default - OpenAI chat-gpt-turbo
314
+ summary_llm: The language model to use for summarizing documents. If None, llm is used.
315
+ name: The name of the collection.
316
+ index_path: The path to the index file IF pickled. If None, defaults to using name in $HOME/.paperqa/name
317
+ """
318
+ self.docs = dict()
319
+ self.keys = set()
320
+ self.chunk_size_limit = chunk_size_limit
321
+
322
+ self.index_docstore = None
323
+
324
+ if llm is None:
325
+ llm = ChatOpenAI(temperature=0.1, max_tokens=512)
326
+ if summary_llm is None:
327
+ summary_llm = llm
328
+ self.update_llm(llm, summary_llm)
329
+
330
+ if index_path is None:
331
+ index_path = TEMP_DIR / name
332
+ self.index_path = index_path
333
+ self.name = name
334
+
335
+ def update_llm(self, llm: BaseLLM | ChatOpenAI, summary_llm: Optional[BaseLLM] = None) -> None:
336
+ """Update the LLM for answering questions."""
337
+ self.llm = llm
338
+ if summary_llm is None:
339
+ summary_llm = llm
340
+ self.summary_llm = summary_llm
341
+ self.summary_chain = LLMChain(prompt=chat_summary_prompt, llm=summary_llm)
342
+ self.search_chain = LLMChain(prompt=search_prompt, llm=llm)
343
+ self.cite_chain = LLMChain(prompt=citation_prompt, llm=llm)
344
+
345
+ def add(
346
+ self,
347
+ path: str,
348
+ citation: Optional[str] = None,
349
+ key: Optional[str] = None,
350
+ disable_check: bool = False,
351
+ chunk_chars: Optional[int] = 3000,
352
+ ) -> None:
353
+ """Add a document to the collection."""
354
+
355
+ if path in self.docs:
356
+ print(f"Document {path} already in collection.")
357
+ return None
358
+
359
+ if citation is None:
360
+ # peak first chunk
361
+ texts, _ = read_source(path, "", "", chunk_chars=chunk_chars)
362
+ with get_openai_callback() as cb:
363
+ citation = self.cite_chain.run(texts[0])
364
+ if len(citation) < 3 or "Unknown" in citation or "insufficient" in citation:
365
+ citation = f"Unknown, {os.path.basename(path)}, {datetime.now().year}"
366
+
367
+ if key is None:
368
+ # get first name and year from citation
369
+ try:
370
+ author = re.search(r"([A-Z][a-z]+)", citation).group(1)
371
+ except AttributeError:
372
+ # panicking - no word??
373
+ raise ValueError(
374
+ f"Could not parse key from citation {citation}. Consider just passing key explicitly - e.g. docs.py (path, citation, key='mykey')"
375
+ )
376
+ try:
377
+ year = re.search(r"(\d{4})", citation).group(1)
378
+ except AttributeError:
379
+ year = ""
380
+ key = f"{author}{year}"
381
+ suffix = ""
382
+ while key + suffix in self.keys:
383
+ # move suffix to next letter
384
+ if suffix == "":
385
+ suffix = "a"
386
+ else:
387
+ suffix = chr(ord(suffix) + 1)
388
+ key += suffix
389
+ self.keys.add(key)
390
+
391
+ texts, metadata = read_source(path, citation, key, chunk_chars=chunk_chars)
392
+ # loose check to see if document was loaded
393
+ #
394
+ if len("".join(texts)) < 10 or (
395
+ not disable_check and not maybe_is_text("".join(texts))
396
+ ):
397
+ raise ValueError(
398
+ f"This does not look like a text document: {path}. Path disable_check to ignore this error."
399
+ )
400
+
401
+ self.docs[path] = dict(texts=texts, metadata=metadata, key=key)
402
+ if self.index_docstore is not None:
403
+ self.index_docstore.add_texts(texts, metadatas=metadata)
404
+
405
+ def clear(self) -> None:
406
+ """Clear the collection of documents."""
407
+ self.docs = dict()
408
+ self.keys = set()
409
+ self.index_docstore = None
410
+ # delete index file
411
+ pkl = self.index_path / "index.pkl"
412
+ if pkl.exists():
413
+ pkl.unlink()
414
+ fs = self.index_path / "index.faiss"
415
+ if fs.exists():
416
+ fs.unlink()
417
+
418
+ @property
419
+ def doc_previews(self) -> List[Tuple[int, str, str]]:
420
+ """Return a list of tuples of (key, citation) for each document."""
421
+ return [
422
+ (
423
+ len(doc["texts"]),
424
+ doc["metadata"][0]["dockey"],
425
+ doc["metadata"][0]["citation"],
426
+ )
427
+ for doc in self.docs.values()
428
+ ]
429
+
430
+ # to pickle, we have to save the index as a file
431
+ def __getstate__(self):
432
+ # pickle calls __getstate__ with no arguments, so fall back to the default embedding
433
+ embedding = OpenAIEmbeddings()
434
+ if self.index_docstore is None and len(self.docs) > 0:
435
+ self._build_faiss_index(embedding)
436
+ state = self.__dict__.copy()
437
+ if self.index_docstore is not None:
438
+ state["_index"].save_local(self.index_path)
439
+ del state["_index"]
440
+ # remove LLMs (they can have callbacks, which can't be pickled)
441
+ del state["summary_chain"]
442
+ del state["qa_chain"]
443
+ del state["cite_chain"]
444
+ del state["search_chain"]
445
+ return state
446
+
447
+ def __setstate__(self, state):
448
+ self.__dict__.update(state)
449
+ try:
450
+ self.index_docstore = FAISS.load_local(self.index_path, OpenAIEmbeddings())
451
+ except Exception:
452
+ # they use some special exception type, but I don't want to import it
453
+ self.index_docstore = None
454
+ self.update_llm(
455
+ ChatOpenAI(temperature=0.1, max_tokens=512)
456
+ )
457
+
458
+ def _build_faiss_index(self, embedding: Embeddings = None):
459
+ if embedding is None:
460
+ embedding = OpenAIEmbeddings()
461
+ if self.index_docstore is None:
462
+ texts = reduce(
463
+ lambda x, y: x + y, [doc["texts"] for doc in self.docs.values()], []
464
+ )
465
+ metadatas = reduce(
466
+ lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
467
+ )
468
+
469
+ # if the index exists, load it
470
+ if LOAD_INDEX_LOCALLY and (self.index_path / "index.faiss").exists():
471
+ self.index_docstore = FAISS.load_local(self.index_path, embedding)
472
+
473
+ # search if the text and metadata already existed in the index
474
+ for i in reversed(range(len(texts))):
475
+ text = texts[i]
476
+ metadata = metadatas[i]
477
+ for key, value in self.index_docstore.docstore.dict_.items():
478
+ if value.page_content == text:
479
+ if value.metadata.get('citation').split(os.sep)[-1] != metadata.get('citation').split(os.sep)[-1]:
480
+ self.index_docstore.docstore.dict_[key].metadata['citation'] = metadata.get('citation').split(os.sep)[-1]
481
+ self.index_docstore.docstore.dict_[key].metadata['dockey'] = metadata.get('citation').split(os.sep)[-1]
482
+ self.index_docstore.docstore.dict_[key].metadata['key'] = metadata.get('citation').split(os.sep)[-1]
483
+ texts.pop(i)
484
+ metadatas.pop(i)
485
+
486
+ # add remaining texts
487
+ if texts:
488
+ self.index_docstore.add_texts(texts=texts, metadatas=metadatas)
489
+ else:
490
+ # create a new index
491
+ self.index_docstore = FAISS.from_texts(texts, embedding, metadatas=metadatas)
492
+ #
493
+
494
+ if SAVE_INDEX_LOCALLY:
495
+ # save index.
496
+ self.index_docstore.save_local(self.index_path)
497
+
498
+ def _build_pinecone_index(self, embedding: Embeddings = None):
499
+ if embedding is None:
500
+ embedding = OpenAIEmbeddings()
501
+ if self.index_docstore is None:
502
+ pinecone.init(
503
+ api_key=os.environ['PINECONE_API_KEY'], # find at app.pinecone.io
504
+ environment=os.environ['PINECONE_ENVIRONMENT'] # next to api key in console
505
+ )
506
+ texts = reduce(
507
+ lambda x, y: x + y, [doc["texts"] for doc in self.docs.values()], []
508
+ )
509
+ metadatas = reduce(
510
+ lambda x, y: x + y, [doc["metadata"] for doc in self.docs.values()], []
511
+ )
512
+
513
+ # TODO: when the index already exists, update it instead of deleting it
514
+ # index_name = "langchain-demo1"
515
+ # if index_name in pinecone.list_indexes():
516
+ # self.index_docstore = pinecone.Index(index_name)
517
+ # vectors = []
518
+ # for text, metadata in zip(texts, metadatas):
519
+ # # embed = <faltaria saber con que embedding se hizo el index que ya existia>
520
+ # self.index_docstore.upsert(vectors=vectors)
521
+ # else:
522
+ # if openai.api_type == 'azure':
523
+ # self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
524
+ # else:
525
+ # self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
526
+
527
+ index_name = "langchain-demo1"
528
+
529
+ # if the index exists, delete it
530
+ if index_name in pinecone.list_indexes():
531
+ pinecone.delete_index(index_name)
532
+
533
+ # create new index
534
+ if openai.api_type == 'azure':
535
+ self.index_docstore = Pinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
536
+ else:
537
+ self.index_docstore = OriginalPinecone.from_texts(texts, embedding, metadatas=metadatas, index_name=index_name)
538
+
539
+ def get_evidence(
540
+ self,
541
+ answer: Answer,
542
+ embedding: Embeddings,
543
+ k: int = 3,
544
+ max_sources: int = 5,
545
+ marginal_relevance: bool = True,
546
+ ) -> str:
547
+ if self.index_docstore is None:
548
+ self._build_faiss_index(embedding)
549
+
550
+ init_search_time = time.time()
551
+
552
+ # retrieve k chunks, but consider a larger candidate pool (fetch_k = 5 * k)
553
+ if marginal_relevance:
554
+ docs = self.index_docstore.max_marginal_relevance_search(
555
+ answer.question, k=k, fetch_k=5 * k
556
+ )
557
+ else:
558
+ docs = self.index_docstore.similarity_search(
559
+ answer.question, k=k, fetch_k=5 * k
560
+ )
561
+ if OPERATING_MODE == "debug":
562
+ print(f"time to search docs to build context: {time.time() - init_search_time:.2f} [s]")
563
+ init_summary_time = time.time()
564
+ partial_summary_time = ""
565
+ for i, doc in enumerate(docs):
566
+ with get_openai_callback() as cb:
567
+ init__partial_summary_time = time.time()
568
+ summary_of_chunked_text = self.summary_chain.run(
569
+ question=answer.question, context_str=doc.page_content
570
+ )
571
+ if OPERATING_MODE == "debug":
572
+ partial_summary_time += f"- time to make relevant summary of doc '{i}': {time.time() - init__partial_summary_time:.2f} [s]\n"
573
+ engine = self.summary_chain.llm.model_kwargs.get('deployment_id') or self.summary_chain.llm.model_name
574
+ if not answer.tokens:
575
+ answer.tokens = [{
576
+ 'engine': engine,
577
+ 'total_tokens': cb.total_tokens}]
578
+ else:
579
+ answer.tokens.append({
580
+ 'engine': engine,
581
+ 'total_tokens': cb.total_tokens
582
+ })
583
+ summarized_package = (
584
+ doc.metadata["key"],
585
+ doc.metadata["citation"],
586
+ summary_of_chunked_text,
587
+ doc.page_content,
588
+ )
589
+ if "Not applicable" not in summary_of_chunked_text and summarized_package not in answer.packages:
590
+ answer.packages.append(summarized_package)
591
+ yield answer
592
+ if len(answer.packages) == max_sources:
593
+ break
594
+ if OPERATING_MODE == "debug":
595
+ print(f"time to make all relevant summaries: {time.time() - init_summary_time:.2f} [s]")
596
+ # the last character is not printed because it is a trailing \n
597
+ print(partial_summary_time[:-1])
598
+ context_str = "\n\n".join(
599
+ [f"{citation}: {summary_of_chunked_text}"
600
+ for key, citation, summary_of_chunked_text, chunked_text in answer.packages
601
+ if "Not applicable" not in summary_of_chunked_text]
602
+ )
603
+ chunks_str = "\n\n".join(
604
+ [f"{citation}: {chunked_text}"
605
+ for key, citation, summary_of_chunked_text, chunked_text in answer.packages
606
+ if "Not applicable" not in summary_of_chunked_text]
607
+ )
608
+ valid_keys = [key
609
+ for key, citation, summary_of_chunked_text, chunked_text in answer.packages
610
+ if "Not applicable" not in summary_of_chunked_text]
611
+ if len(valid_keys) > 0:
612
+ context_str += "\n\nValid keys: " + ", ".join(valid_keys)
613
+ chunks_str += "\n\nValid keys: " + ", ".join(valid_keys)
614
+ answer.context = context_str
615
+ answer.chunks = chunks_str
616
+ yield answer
617
+
618
+ def query(
619
+ self,
620
+ query: str,
621
+ embedding: Embeddings,
622
+ chat_history: list[tuple[str, str]],
623
+ k: int = 10,
624
+ max_sources: int = 5,
625
+ length_prompt: str = "about 100 words",
626
+ marginal_relevance: bool = True,
627
+ ):
628
+ for answer in self._query(
629
+ query,
630
+ embedding,
631
+ chat_history,
632
+ k=k,
633
+ max_sources=max_sources,
634
+ length_prompt=length_prompt,
635
+ marginal_relevance=marginal_relevance,
636
+ ):
637
+ pass
638
+ return answer
639
+
640
+ def _query(
641
+ self,
642
+ query: str,
643
+ embedding: Embeddings,
644
+ chat_history: list[tuple[str, str]],
645
+ k: int,
646
+ max_sources: int,
647
+ length_prompt: str,
648
+ marginal_relevance: bool,
649
+ ):
650
+ if k < max_sources:
651
+ k = max_sources + 1
652
+
653
+ answer = Answer(question=query)
654
+
655
+ messages_qa = [system_message_prompt]
656
+ if len(chat_history) != 0:
657
+ for conversation in chat_history:
658
+ messages_qa.append(HumanMessagePromptTemplate.from_template(conversation[0]))
659
+ messages_qa.append(AIMessagePromptTemplate.from_template(conversation[1]))
660
+ messages_qa.append(human_qa_message_prompt)
661
+ chat_qa_prompt = ChatPromptTemplate.from_messages(messages_qa)
662
+ self.qa_chain = LLMChain(prompt=chat_qa_prompt, llm=self.llm)
663
+
664
+ for answer in self.get_evidence(
665
+ answer,
666
+ embedding,
667
+ k=k,
668
+ max_sources=max_sources,
669
+ marginal_relevance=marginal_relevance,
670
+ ):
671
+ yield answer
672
+
673
+ references_dict = dict()
674
+ passages = dict()
675
+ if len(answer.context) < 10:
676
+ answer_text = "I cannot answer this question due to insufficient information."
677
+ else:
678
+ with get_openai_callback() as cb:
679
+ init_qa_time = time.time()
680
+ answer_text = self.qa_chain.run(
681
+ question=answer.question, context_str=answer.context, length=length_prompt
682
+ )
683
+ if OPERATING_MODE == "debug":
684
+ print(f"time to make the Q&A answer: {time.time() - init_qa_time:.2f} [s]")
685
+ engine = self.qa_chain.llm.model_kwargs.get('deployment_id') or self.qa_chain.llm.model_name
686
+ if not answer.tokens:
687
+ answer.tokens = [{
688
+ 'engine': engine,
689
+ 'total_tokens': cb.total_tokens}]
690
+ else:
691
+ answer.tokens.append({
692
+ 'engine': engine,
693
+ 'total_tokens': cb.total_tokens
694
+ })
695
+
696
+ # the model sometimes copies the example citation marker from the prompt; strip it
697
+ if "(Example2012)" in answer_text:
698
+ answer_text = answer_text.replace("(Example2012)", "")
699
+ for key, citation, summary, text in answer.packages:
700
+ # do check for whole key (so we don't catch Callahan2019a with Callahan2019)
701
+ skey = key.split(" ")[0]
702
+ if skey + " " in answer_text or skey + ")" in answer_text:
703
+ references_dict[skey] = citation
704
+ passages[key] = text
705
+ references_str = "\n\n".join(
706
+ [f"{i+1}. ({k}): {c}" for i, (k, c) in enumerate(references_dict.items())]
707
+ )
708
+
709
+ # cost_str = f"{answer_text}\n\n"
710
+ cost_str = ""
711
+ itemized_cost = ""
712
+ total_amount = 0
713
+ for d in answer.tokens:
714
+ total_tokens = d.get('total_tokens')
715
+ if total_tokens:
716
+ engine = d.get('engine')
717
+ key_price = None
718
+ for key in PRICES.keys():
719
+ if re.match(f"{key}", engine):
720
+ key_price = key
721
+ break
722
+ if PRICES.get(key_price):
723
+ partial_amount = total_tokens / 1000 * PRICES.get(key_price)
724
+ total_amount += partial_amount
725
+ itemized_cost += f"- {engine}: {total_tokens} tokens\t ---> ${partial_amount:.4f},\n"
726
+ else:
727
+ itemized_cost += f"- {engine}: {total_tokens} tokens,\n"
728
+ # delete ,\n
729
+ itemized_cost = itemized_cost[:-2]
730
+
731
+ # add tokens to formatted answer
732
+ cost_str += f"Total cost: ${total_amount:.4f}\nItemized cost:\n{itemized_cost}"
733
+
734
+ answer.answer = answer_text
735
+ answer.cost_str = cost_str
736
+ answer.references = references_str
737
+ answer.passages = passages
738
+ yield answer
739
+
740
+
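For orientation, here is a minimal usage sketch of the `Dataset` class defined above, used outside Streamlit. It assumes `OPENAI_API_KEY` is set and that the constructor parameters keep the defaults described in the docstring; the file path and citation below are placeholders.

```python
# Hedged sketch (not part of the repo): exercise Dataset directly.
from langchain.embeddings.openai import OpenAIEmbeddings

from streamlit_langchain_chat.dataset import Dataset

dataset = Dataset()  # assumed defaults: ChatOpenAI(temperature=0.1, max_tokens=512), name="default"
dataset.add(
    "tempDir/example_policy.pdf",             # placeholder local file
    citation="Example Policy Document, 2023",
    key="Example2023",
    disable_check=True,
)

embeddings = OpenAIEmbeddings()
dataset._build_faiss_index(embeddings)         # build (or load) the local FAISS index

answer = dataset.query(
    "What does the policy say about annual leave?",
    embeddings,
    chat_history=[],                           # list of (user, assistant) tuples
    max_sources=5,
)
print(answer.answer)
print(answer.references)
print(answer.cost_str)
```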
streamlit_langchain_chat/inputs/__init__.py ADDED
File without changes
streamlit_langchain_chat/prompts.py ADDED
@@ -0,0 +1,91 @@
1
+ import langchain.prompts as prompts
2
+ from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
3
+ from datetime import datetime
4
+
5
+ summary_template = """Summarize and provide direct quotes from the text below to help answer a question.
6
+ Do not directly answer the question, instead provide a summary and quotes with the context of the user's question.
7
+ Do not use outside sources.
8
+ Reply with "Not applicable" if the text is unrelated to the question.
9
+ Use 75 words or fewer.
10
+ Remember, if the user does not specify a language, reply in the language of the user's question.
11
+
12
+ {context_str}
13
+
14
+ User's question: {question}
15
+ Relevant Information Summary:"""
16
+ summary_prompt = prompts.PromptTemplate(
17
+ input_variables=["question", "context_str"],
18
+ template=summary_template,
19
+ )
20
+
21
+ qa_template = """Write an answer for the user's question below solely based on the provided context.
22
+ If the user does not specify how many words the answer should be, the length of the answer should be {length}.
23
+ If the context is irrelevant, reply "Your question falls outside the scope of University of Sydney policy, so I cannot answer".
24
+ For each sentence in your answer, indicate which sources most support it via valid citation markers at the end of sentences, like (Example2012).
25
+ Answer in an unbiased and professional tone.
26
+ Make clear what is your opinion.
27
+ Use Markdown for formatting code or text, and try to use direct quotes to support arguments.
28
+ Remember, if the user does not specify a language, answer in the language of the user's question.
29
+
30
+ Context:
31
+ {context_str}
32
+
33
+
34
+ User's question: {question}
35
+ Answer:
36
+ """
37
+ qa_prompt = prompts.PromptTemplate(
38
+ input_variables=["question", "context_str", "length"],
39
+ template=qa_template,
40
+ )
41
+
42
+ # used by GPCL
43
+ qa_prompt_GPCL = prompts.PromptTemplate(
44
+ input_variables=["question", "context_str"],
45
+ template="You are an AI assistant providing helpful advice about University of Sydney policy. You are given the following extracted parts of a long document and a question. Provide a conversational answer based on the context provided."
46
+ "You should only provide hyperlinks that reference the context below. Do NOT make up hyperlinks."
47
+ 'If you can not find the answer in the context below, just say "Hmm, I am not sure. Could you please rephrase your question?" Do not try to make up an answer.'
48
+ "If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.\n\n"
49
+ "Question: {question}\n"
50
+ "=========\n"
51
+ "{context_str}\n"
52
+ "=========\n"
53
+ "Answer in Markdown:",
54
+ )
55
+
56
+ search_prompt = prompts.PromptTemplate(
57
+ input_variables=["question"],
58
+ template="We want to answer the following question: {question} \n"
59
+ "Provide three different targeted keyword searches (one search per line) "
60
+ "that will find papers that help answer the question. Do not use boolean operators. "
61
+ "Recent years are 2021, 2022, 2023.\n\n"
62
+ "1.",
63
+ )
64
+
65
+
66
+ def _get_datetime():
67
+ now = datetime.now()
68
+ return now.strftime("%m/%d/%Y")
69
+
70
+
71
+ citation_prompt = prompts.PromptTemplate(
72
+ input_variables=["text"],
73
+ template="Provide a possible citation for the following text in MLA Format. Today's date is {date}\n"
74
+ "{text}\n\n"
75
+ "Citation:",
76
+ partial_variables={"date": _get_datetime},
77
+ )
78
+
79
+ system_template = """You are an AI chatbot with knowledge of the University of Sydney's legal policies that answers in an unbiased, professional tone.
80
+ You sometimes refuse to answer if there is insufficient information.
81
+ If the user does not specify a language, answer in the language of the user's question. """
82
+ system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
83
+
84
+ human_summary_message_prompt = HumanMessagePromptTemplate.from_template(summary_template)
85
+ chat_summary_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_summary_message_prompt])
86
+
87
+ human_qa_message_prompt = HumanMessagePromptTemplate.from_template(qa_template)
88
+ # chat_qa_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_qa_message_prompt]) # TODO: delete
89
+
90
+ # human_condense_message_prompt = HumanMessagePromptTemplate.from_template(condense_template)
91
+ # chat_condense_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_condense_message_prompt])
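As a pointer back to how these templates are consumed, here is a hedged sketch of assembling the Q&A chat prompt with prior turns, mirroring `Dataset._query` in dataset.py; the conversation turns are placeholders.

```python
# Hedged sketch: compose the chat Q&A prompt the way Dataset._query does.
from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts.chat import (
    AIMessagePromptTemplate,
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
)

from streamlit_langchain_chat.prompts import human_qa_message_prompt, system_message_prompt

chat_history = [("What is the leave policy?", "Staff are entitled to ...")]  # placeholder turns

messages = [system_message_prompt]
for user_turn, assistant_turn in chat_history:
    messages.append(HumanMessagePromptTemplate.from_template(user_turn))
    messages.append(AIMessagePromptTemplate.from_template(assistant_turn))
messages.append(human_qa_message_prompt)

chat_qa_prompt = ChatPromptTemplate.from_messages(messages)
qa_chain = LLMChain(prompt=chat_qa_prompt, llm=ChatOpenAI(temperature=0.1, max_tokens=512))
# qa_chain.run(question="...", context_str="...", length="about 100 words")
```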
streamlit_langchain_chat/streamlit_app.py ADDED
@@ -0,0 +1,561 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ To run:
4
+ - activate the virtual environment
5
+ - streamlit run path\to\streamlit_app.py
6
+ """
7
+ import logging
8
+ import os
9
+ import re
10
+ import sys
11
+ import time
12
+ import warnings
13
+ import shutil
14
+
15
+ from langchain.chat_models import ChatOpenAI
16
+ from langchain.embeddings.openai import OpenAIEmbeddings
17
+ import openai
18
+ import pandas as pd
19
+ import streamlit as st
20
+ from st_aggrid import GridOptionsBuilder, AgGrid, GridUpdateMode, ColumnsAutoSizeMode
21
+ from streamlit_chat import message
22
+
23
+ from streamlit_langchain_chat.constants import *
24
+ from streamlit_langchain_chat.customized_langchain.llms import OpenAI, AzureOpenAI, AzureOpenAIChat
25
+ from streamlit_langchain_chat.dataset import Dataset
26
+
27
+ # Configure logger
28
+ logging.basicConfig(format="\n%(asctime)s\n%(message)s", level=logging.INFO, force=True)
29
+ logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
30
+
31
+ warnings.filterwarnings('ignore')
32
+
33
+ if 'generated' not in st.session_state:
34
+ st.session_state['generated'] = []
35
+ if 'past' not in st.session_state:
36
+ st.session_state['past'] = []
37
+ if 'costs' not in st.session_state:
38
+ st.session_state['costs'] = []
39
+ if 'contexts' not in st.session_state:
40
+ st.session_state['contexts'] = []
41
+ if 'chunks' not in st.session_state:
42
+ st.session_state['chunks'] = []
43
+ if 'user_input' not in st.session_state:
44
+ st.session_state['user_input'] = ""
45
+ if 'dataset' not in st.session_state:
46
+ st.session_state['dataset'] = None
47
+
48
+
49
+ def check_api_keys() -> bool:
50
+ source_id = app.params['source_id']
51
+ index_id = app.params['index_id']
52
+
53
+ open_api_key = os.getenv('OPENAI_API_KEY', '')
54
+ openapi_api_key_ready = type(open_api_key) is str and len(open_api_key) > 0
55
+
56
+ pinecone_api_key = os.getenv('PINECONE_API_KEY', '')
57
+ pinecone_api_key_ready = type(pinecone_api_key) is str and len(pinecone_api_key) > 0 if index_id == 2 else True
58
+
59
+ is_ready = True if openapi_api_key_ready and pinecone_api_key_ready else False
60
+ return is_ready
61
+
62
+
63
+ def check_combination_point() -> bool:
64
+ type_id = app.params['type_id']
65
+ open_api_key = os.getenv('OPENAI_API_KEY', '')
66
+ openapi_api_key_ready = type(open_api_key) is str and len(open_api_key) > 0
67
+ api_base = app.params['api_base']
68
+
69
+ if type_id == 1:
70
+ deployment_id = app.params['deployment_id']
71
+ return True if openapi_api_key_ready and api_base and deployment_id else False
72
+ elif type_id == 2:
73
+ return True if openapi_api_key_ready and api_base else False
74
+ else:
75
+ return False
76
+
77
+
78
+ def check_index() -> bool:
79
+ dataset = st.session_state['dataset']
80
+
81
+ index_built = dataset.index_docstore if hasattr(dataset, "index_docstore") else False
82
+ without_source = app.params['source_id'] == 4
83
+ is_ready = True if index_built or without_source else False
84
+ return is_ready
85
+
86
+
87
+ def check_index_point() -> bool:
88
+ index_id = app.params['index_id']
89
+
90
+ pinecone_api_key = os.getenv('PINECONE_API_KEY', '')
91
+ pinecone_api_key_ready = type(pinecone_api_key) is str and len(pinecone_api_key) > 0 if index_id == 2 else True
92
+ pinecone_environment = os.getenv('PINECONE_ENVIRONMENT', False) if index_id == 2 else True
93
+
94
+ is_ready = True if index_id and pinecone_api_key_ready and pinecone_environment else False
95
+ return is_ready
96
+
97
+
98
+ def check_params_point() -> bool:
99
+ max_sources = app.params['max_sources']
100
+ temperature = app.params['temperature']
101
+
102
+ is_ready = True if max_sources and isinstance(temperature, float) else False
103
+ return is_ready
104
+
105
+
106
+ def check_source_point() -> bool:
107
+ return True
108
+
109
+
110
+ def clear_chat_history():
111
+ if st.session_state['past'] or st.session_state['generated'] or st.session_state['contexts'] or st.session_state['chunks'] or st.session_state['costs']:
112
+ st.session_state['past'] = []
113
+ st.session_state['generated'] = []
114
+ st.session_state['contexts'] = []
115
+ st.session_state['chunks'] = []
116
+ st.session_state['costs'] = []
117
+
118
+
119
+ def clear_index():
120
+ if dataset := st.session_state['dataset']:
121
+ # delete directory (with files)
122
+ index_path = dataset.index_path
123
+ if index_path.exists():
124
+ shutil.rmtree(str(index_path))
125
+
126
+ # update variable
127
+ st.session_state['dataset'] = None
128
+
129
+ elif (TEMP_DIR / "default").exists():
130
+ shutil.rmtree(str(TEMP_DIR / "default"))
131
+
132
+
133
+ def check_sources() -> bool:
134
+ uploaded_files_rows = app.params['uploaded_files_rows']
135
+ urls_df = app.params['urls_df']
136
+ source_id = app.params['source_id']
137
+
138
+ some_files = True if uploaded_files_rows and uploaded_files_rows[-1].get('filepath') != "" else False
139
+ some_urls = bool([True for url, citation in urls_df.to_numpy() if url])
140
+
141
+ only_local_files = some_files and not some_urls
142
+ only_urls = not some_files and some_urls
143
+ is_ready = only_local_files or only_urls or (source_id == 4)
144
+ return is_ready
145
+
146
+
147
+ def collect_dataset_and_built_index():
148
+ start = time.time()
149
+ uploaded_files_rows = app.params['uploaded_files_rows']
150
+ urls_df = app.params['urls_df']
151
+ type_id = app.params['type_id']
152
+ temperature = app.params['temperature']
153
+ index_id = app.params['index_id']
154
+ api_base = app.params['api_base']
155
+ deployment_id = app.params['deployment_id']
156
+
157
+ some_files = True if uploaded_files_rows and uploaded_files_rows[-1].get('filepath') != "" else False
158
+ some_urls = bool([True for url, citation in urls_df.to_numpy() if url])
159
+
160
+ openai.api_type = "azure" if type_id == 1 else "open_ai"
161
+ openai.api_base = api_base
162
+ openai.api_version = "2023-03-15-preview" if type_id == 1 else None
163
+
164
+ if deployment_id != "text-davinci-003":
165
+ dataset = Dataset(
166
+ llm=ChatOpenAI(
167
+ temperature=temperature,
168
+ max_tokens=512,
169
+ deployment_id=deployment_id,
170
+ )
171
+ )
172
+ else:
173
+ dataset = Dataset(
174
+ llm=OpenAI(
175
+ temperature=temperature,
176
+ max_tokens=512,
177
+ deployment_id=COMBINATIONS_OPTIONS.get(combination_id).get('deployment_name'),
178
+ )
179
+ )
180
+
181
+ # get url documents
182
+ if some_urls:
183
+ urls_df = urls_df.reset_index()
184
+ for url_index, url_row in urls_df.iterrows():
185
+ url = url_row.get('urls', '')
186
+ citation = url_row.get('citation string', '')
187
+ if url:
188
+ try:
189
+ dataset.add(
190
+ url,
191
+ citation,
192
+ citation,
193
+ disable_check=True # True to accept Japanese letters
194
+ )
195
+ except Exception as e:
196
+ print(e)
197
+ pass
198
+
199
+ # add the locally uploaded files (rows selected in the sidebar grid)
200
+ if some_files:
201
+ for uploaded_files_row in uploaded_files_rows:
202
+ key = uploaded_files_row.get('citation string') if ',' not in uploaded_files_row.get('citation string') else None
203
+ dataset.add(
204
+ uploaded_files_row.get('filepath'),
205
+ uploaded_files_row.get('citation string'),
206
+ key=key,
207
+ disable_check=True # True to accept Japanese letters
208
+ )
209
+
210
+ openai_embeddings = OpenAIEmbeddings(
211
+ document_model_name="text-embedding-ada-002",
212
+ query_model_name="text-embedding-ada-002",
213
+ )
214
+ if index_id == 1:
215
+ dataset._build_faiss_index(openai_embeddings)
216
+ else:
217
+ dataset._build_pinecone_index(openai_embeddings)
218
+ st.session_state['dataset'] = dataset
219
+
220
+ if OPERATING_MODE == "debug":
221
+ print(f"time to collect dataset: {time.time() - start:.2f} [s]")
222
+
223
+
224
+ def configure_streamlit_and_page():
225
+ # Configure Streamlit page and state
226
+ st.set_page_config(**ST_CONFIG)
227
+
228
+ # Force responsive layout for columns also on mobile
229
+ st.write(
230
+ """<style>
231
+ [data-testid="column"] {
232
+ width: calc(50% - 1rem);
233
+ flex: 1 1 calc(50% - 1rem);
234
+ min-width: calc(50% - 1rem);
235
+ }
236
+ </style>""",
237
+ unsafe_allow_html=True,
238
+ )
239
+
240
+
241
+ def get_answer():
242
+ query = st.session_state['user_input']
243
+ dataset = st.session_state['dataset']
244
+ type_id = app.params['type_id']
245
+ index_id = app.params['index_id']
246
+ max_sources = app.params['max_sources']
247
+
248
+ if query and dataset and type_id and index_id:
249
+ chat_history = [(past, generated)
250
+ for (past, generated) in zip(st.session_state['past'], st.session_state['generated'])]
251
+ marginal_relevance = index_id == 1  # must be False when Pinecone is used
252
+ start = time.time()
253
+ openai_embeddings = OpenAIEmbeddings(
254
+ document_model_name="text-embedding-ada-002",
255
+ query_model_name="text-embedding-ada-002",
256
+ )
257
+ result = dataset.query(
258
+ query,
259
+ openai_embeddings,
260
+ chat_history,
261
+ marginal_relevance=marginal_relevance, # if pinecone is used it must be False
262
+ )
263
+ if OPERATING_MODE == "debug":
264
+ print(f"time to get answer: {time.time() - start:.2f} [s]")
265
+ print("-" * 10)
266
+ # response = {'generated_text': result.formatted_answer}
267
+ # response = {'generated_text': f"test_{len(st.session_state['generated'])} by {query}"} # @debug
268
+ return result
269
+ else:
270
+ return None
271
+
272
+
273
+ def load_main_page():
274
+ """
275
+ Load the body of web.
276
+ """
277
+ # Streamlit HTML Markdown
278
+ # st.title <h1> #
279
+ # st.header <h2> ##
280
+ # st.subheader <h3> ###
281
+ st.markdown(f"## Augmented-Retrieval Q&A ChatGPT ({APP_VERSION})")
282
+ validate_status()
283
+ st.markdown(f"#### **Status**: {app.params['status']}")
284
+
285
+ # hidden div with anchor
286
+ st.markdown("<div id='linkto_top'></div>", unsafe_allow_html=True)
287
+ col1, col2, col3 = st.columns(3)
288
+ col1.button(label="clear index", type="primary", on_click=clear_index)
289
+ col2.button(label="clear conversation", type="primary", on_click=clear_chat_history)
290
+ col3.markdown("<a href='#linkto_bottom'>Link to bottom</a>", unsafe_allow_html=True)
291
+
292
+ if st.session_state["generated"]:
293
+ for i in range(len(st.session_state["generated"])):
294
+ message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
295
+ message(st.session_state['generated'][i], key=str(i))
296
+ with st.expander("See context"):
297
+ st.write(st.session_state['contexts'][i])
298
+ with st.expander("See chunks"):
299
+ st.write(st.session_state['chunks'][i])
300
+ with st.expander("See costs"):
301
+ st.write(st.session_state['costs'][i])
302
+ dataset = st.session_state['dataset']
303
+ index_built = dataset.index_docstore if hasattr(dataset, "index_docstore") else False
304
+ without_source = app.params['source_id'] == 4
305
+ enable_chat_button = index_built or without_source
306
+ st.text_input("You:",
307
+ key='user_input',
308
+ on_change=on_enter,
309
+ disabled=not enable_chat_button
310
+ )
311
+
312
+ st.markdown("<a href='#linkto_top'>Link to top</a>", unsafe_allow_html=True)
313
+ # hidden div with anchor
314
+ st.markdown("<div id='linkto_bottom'></div>", unsafe_allow_html=True)
315
+
316
+
317
+ def load_sidebar_page():
318
+ st.sidebar.markdown("## Instructions")
319
+
320
+ # ############ #
321
+ # SOURCES TYPE #
322
+ # ############ #
323
+ st.sidebar.markdown("1. Select a source:")
324
+ source_selected = st.sidebar.selectbox(
325
+ "Choose the location of your info to give context to chatgpt",
326
+ [key for key, value in SOURCES_IDS.items()])
327
+ app.params['source_id'] = SOURCES_IDS.get(source_selected, None)
328
+
329
+ # ##### #
330
+ # MODEL #
331
+ # ##### #
332
+ st.sidebar.markdown("2. Select a model (LLM):")
333
+ combination_selected = st.sidebar.selectbox(
334
+ "Choose type: MSF Azure OpenAI and model / OpenAI",
335
+ [key for key, value in TYPE_IDS.items()])
336
+ app.params['type_id'] = TYPE_IDS.get(combination_selected, None)
337
+
338
+ if app.params['type_id'] == 1: # with AzureOpenAI endpoint
339
+ # https://docs.streamlit.io/library/api-reference/widgets/st.text_input
340
+ os.environ['OPENAI_API_KEY'] = st.sidebar.text_input(
341
+ label="Enter Azure OpenAI API Key",
342
+ type="password"
343
+ ).strip()
344
+ app.params['api_base'] = st.sidebar.text_input(
345
+ label="Enter Azure API base",
346
+ placeholder="https://<api_base_endpoint>.openai.azure.com/",
347
+ ).strip()
348
+ app.params['deployment_id'] = st.sidebar.text_input(
349
+ label="Enter Azure deployment_id",
350
+ ).strip()
351
+ elif app.params['type_id'] == 2: # with OpenAI endpoint
352
+ os.environ['OPENAI_API_KEY'] = st.sidebar.text_input(
353
+ label="Enter OpenAI API Key",
354
+ placeholder="sk-...",
355
+ type="password"
356
+ ).strip()
357
+ app.params['api_base'] = "https://api.openai.com/v1"
358
+ app.params['deployment_id'] = None
359
+
360
+ # ####### #
361
+ # INDEXES #
362
+ # ####### #
363
+ st.sidebar.markdown("3. Select a index store:")
364
+ index_selected = st.sidebar.selectbox(
365
+ "Type of Index",
366
+ [key for key, value in INDEX_IDS.items()])
367
+ app.params['index_id'] = INDEX_IDS.get(index_selected, None)
368
+ if app.params['index_id'] == 2: # with pinecone
369
+ os.environ['PINECONE_API_KEY'] = st.sidebar.text_input(
370
+ label="Enter pinecone API Key",
371
+ type="password"
372
+ ).strip()
373
+
374
+ os.environ['PINECONE_ENVIRONMENT'] = st.sidebar.text_input(
375
+ label="Enter pinecone environment",
376
+ placeholder="eu-west1-gcp",
377
+ ).strip()
378
+
379
+ # ############## #
380
+ # CONFIGURATIONS #
381
+ # ############## #
382
+ st.sidebar.markdown("4. Choose configuration:")
383
+ # https://docs.streamlit.io/library/api-reference/widgets/st.number_input
384
+ max_sources = st.sidebar.number_input(
385
+ label="Top-k: Number of chunks/sections (1-5)",
386
+ step=1,
387
+ format="%d",
388
+ value=5
389
+ )
390
+ app.params['max_sources'] = max_sources
391
+ temperature = st.sidebar.number_input(
392
+ label="Temperature (0.0 – 1.0)",
393
+ step=0.1,
394
+ format="%f",
395
+ value=0.0,
396
+ min_value=0.0,
397
+ max_value=1.0
398
+ )
399
+ app.params['temperature'] = round(temperature, 1)
400
+
401
+ # ############## #
402
+ # UPLOAD SOURCES #
403
+ # ############## #
404
+ app.params['uploaded_files_rows'] = []
405
+ if app.params['source_id'] == 1:
406
+ # https://docs.streamlit.io/library/api-reference/widgets/st.file_uploader
407
+ # https://towardsdatascience.com/make-dataframes-interactive-in-streamlit-c3d0c4f84ccb
408
+ st.sidebar.markdown("""5. Upload your local documents and modify citation strings (optional)""")
409
+ uploaded_files = st.sidebar.file_uploader(
410
+ "Choose files",
411
+ accept_multiple_files=True,
412
+ type=['pdf', 'PDF',
413
+ 'txt', 'TXT',
414
+ 'html',
415
+ 'docx', 'DOCX',
416
+ 'pptx', 'PPTX',
417
+ ],
418
+ )
419
+ uploaded_files_dataset = request_pathname(uploaded_files)
420
+ uploaded_files_df = pd.DataFrame(
421
+ uploaded_files_dataset,
422
+ columns=['filepath', 'citation string'])
423
+ uploaded_files_grid_options_builder = GridOptionsBuilder.from_dataframe(uploaded_files_df)
424
+ uploaded_files_grid_options_builder.configure_selection(
425
+ selection_mode='multiple',
426
+ pre_selected_rows=list(range(uploaded_files_df.shape[0])) if uploaded_files_df.iloc[-1, 0] != "" else [],
427
+ use_checkbox=True,
428
+ )
429
+ uploaded_files_grid_options_builder.configure_column("citation string", editable=True)
430
+ uploaded_files_grid_options_builder.configure_auto_height()
431
+ uploaded_files_grid_options = uploaded_files_grid_options_builder.build()
432
+ with st.sidebar:
433
+ uploaded_files_ag_grid = AgGrid(
434
+ uploaded_files_df,
435
+ gridOptions=uploaded_files_grid_options,
436
+ update_mode=GridUpdateMode.SELECTION_CHANGED | GridUpdateMode.VALUE_CHANGED,
437
+ )
438
+ app.params['uploaded_files_rows'] = uploaded_files_ag_grid["selected_rows"]
439
+
440
+ app.params['urls_df'] = pd.DataFrame()
441
+ if app.params['source_id'] == 3:
442
+ st.sidebar.markdown("""5. Write some urls and modify citation strings if you want (to look prettier)""")
443
+ # option 1: with streamlit version 1.20.0+
444
+ # app.params['urls_df'] = st.sidebar.experimental_data_editor(
445
+ # pd.DataFrame([["", ""]], columns=['urls', 'citation string']),
446
+ # use_container_width=True,
447
+ # num_rows="dynamic",
448
+ # )
449
+
450
+ # option 2: with streamlit version 1.19.0
451
+ urls_dataset = [["", ""],
452
+ ["", ""],
453
+ ["", ""],
454
+ ["", ""],
455
+ ["", ""]]
456
+ urls_df = pd.DataFrame(
457
+ urls_dataset,
458
+ columns=['urls', 'citation string'])
459
+
460
+ urls_grid_options_builder = GridOptionsBuilder.from_dataframe(urls_df)
461
+ urls_grid_options_builder.configure_columns(['urls', 'citation string'], editable=True)
462
+ urls_grid_options_builder.configure_auto_height()
463
+ urls_grid_options = urls_grid_options_builder.build()
464
+ with st.sidebar:
465
+ urls_ag_grid = AgGrid(
466
+ urls_df,
467
+ gridOptions=urls_grid_options,
468
+ update_mode=GridUpdateMode.SELECTION_CHANGED | GridUpdateMode.VALUE_CHANGED,
469
+ )
470
+ df = urls_ag_grid.data
471
+ df = df[df.urls != ""]
472
+ app.params['urls_df'] = df
473
+
474
+ if app.params['source_id'] in (1, 2, 3):
475
+ st.sidebar.markdown("""6. Build an index where you can ask""")
476
+ api_keys_ready = check_api_keys()
477
+ source_ready = check_sources()
478
+ enable_index_button = api_keys_ready and source_ready
479
+ if st.sidebar.button("Build index", disabled=not enable_index_button):
480
+ collect_dataset_and_built_index()
481
+
482
+
483
+ def main():
484
+ configure_streamlit_and_page()
485
+ load_sidebar_page()
486
+ load_main_page()
487
+
488
+
489
+ def on_enter():
490
+ output = get_answer()
491
+ if output:
492
+ st.session_state.past.append(st.session_state['user_input'])
493
+ st.session_state.generated.append(output.answer)
494
+ st.session_state.contexts.append(output.context)
495
+ st.session_state.chunks.append(output.chunks)
496
+ st.session_state.costs.append(output.cost_str)
497
+ st.session_state['user_input'] = ""
498
+
499
+
500
+ def request_pathname(files):
501
+ if not files:
502
+ return [["", ""]]
503
+
504
+ # check if the temporary directory exists; if not, create it
505
+ if not TEMP_DIR.exists():
506
+ TEMP_DIR.mkdir(
507
+ parents=True,
508
+ exist_ok=True,
509
+ )
510
+
511
+ file_paths = []
512
+ for file in files:
513
+ # # absolute path
514
+ # file_path = str(TEMP_DIR / file.name)
515
+ # relative path
516
+ file_path = str((TEMP_DIR / file.name).relative_to(ROOT_DIR))
517
+ file_paths.append(file_path)
518
+ with open(file_path, "wb") as f:
519
+ f.write(file.getbuffer())
520
+ return [[filepath, filename.name] for filepath, filename in zip(file_paths, files)]
521
+
522
+
523
+ def validate_status():
524
+ source_point_ready = check_source_point()
525
+ combination_point_ready = check_combination_point()
526
+ index_point_ready = check_index_point()
527
+ params_point_ready = check_params_point()
528
+ sources_ready = check_sources()
529
+ index_ready = check_index()
530
+
531
+ if source_point_ready and combination_point_ready and index_point_ready and params_point_ready and sources_ready and index_ready:
532
+ app.params['status'] = "✨Ready✨"
533
+ elif not source_point_ready:
534
+ app.params['status'] = "⚠️Review step 1 on the sidebar."
535
+ elif not combination_point_ready:
536
+ app.params['status'] = "⚠️Review step 2 on the sidebar. API Keys or endpoint, ..."
537
+ elif not index_point_ready:
538
+ app.params['status'] = "⚠️Review step 3 on the sidebar. Index API Key or environment."
539
+ elif not params_point_ready:
540
+ app.params['status'] = "⚠️Review step 4 on the sidebar"
541
+ elif not sources_ready:
542
+ app.params['status'] = "⚠️Review step 5 on the sidebar. Waiting for some source..."
543
+ elif not index_ready:
544
+ app.params['status'] = "⚠️Review step 6 on the sidebar. Waiting for press button to create index ..."
545
+ else:
546
+ app.params['status'] = "⚠️Something is not ready..."
547
+
548
+
549
+ class StreamlitLangchainChatApp():
550
+ def __init__(self) -> None:
551
+ """Use __init__ to define instance variables. It cannot have any arguments."""
552
+ self.params = dict()
553
+
554
+ def run(self, **state) -> None:
555
+ """Define here all logic required by your application."""
556
+ main()
557
+
558
+
559
+ if __name__ == "__main__":
560
+ app = StreamlitLangchainChatApp()
561
+ app.run()
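For completeness, here is a hedged sketch of the runtime setup the sidebar above expects before the "Build index" and chat inputs are enabled; the key names come from the code above, the values are placeholders.

```python
# Hedged sketch: environment variables read by the app, depending on sidebar choices.
import os

os.environ["OPENAI_API_KEY"] = "sk-..."                # step 2: OpenAI key (or Azure OpenAI key)
# The Azure option additionally asks for an API base and a deployment_id in the sidebar.
# os.environ["PINECONE_API_KEY"] = "..."               # step 3: only if the Pinecone index is chosen
# os.environ["PINECONE_ENVIRONMENT"] = "eu-west1-gcp"  # step 3: Pinecone environment

# Launch from a shell with the virtual environment active:
#   streamlit run streamlit_langchain_chat/streamlit_app.py
```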
streamlit_langchain_chat/utils.py ADDED
@@ -0,0 +1,52 @@
1
+ import math
2
+ import string
3
+
4
+
5
+ def maybe_is_text(s, thresh=2.5):
6
+ if len(s) == 0:
7
+ return False
8
+ # Calculate the entropy of the string
9
+ entropy = 0
10
+ for c in string.printable:
11
+ p = s.count(c) / len(s)
12
+ if p > 0:
13
+ entropy += -p * math.log2(p)
14
+
15
+ # Check if the entropy is within a reasonable range for text
16
+ if entropy > thresh:
17
+ return True
18
+ return False
19
+
20
+
21
+ def maybe_is_code(s):
22
+ if len(s) == 0:
23
+ return False
24
+ # Check if the string contains a lot of non-ascii characters
25
+ if len([c for c in s if ord(c) > 128]) / len(s) > 0.1:
26
+ return True
27
+ return False
28
+
29
+
30
+ def strings_similarity(s1, s2):
31
+ if len(s1) == 0 or len(s2) == 0:
32
+ return 0
33
+ # break the strings into words
34
+ s1 = set(s1.split())
35
+ s2 = set(s2.split())
36
+ # return the similarity ratio
37
+ return len(s1.intersection(s2)) / len(s1.union(s2))
38
+
39
+
40
+ def maybe_is_truncated(s):
41
+ punct = [".", "!", "?", '"']
42
+ if s[-1] in punct:
43
+ return False
44
+ return True
45
+
46
+
47
+ def maybe_is_html(s):
48
+ if len(s) == 0:
49
+ return False
50
+ # check for html tags
51
+ if "<body" in s or "<html" in s or "<div" in s:
52
+ return True
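Finally, a quick hedged illustration of these text heuristics on toy inputs; the expected results in the comments follow from the definitions above and are a sketch, not a test suite.

```python
# Hedged sketch: the heuristics above applied to small example strings.
from streamlit_langchain_chat.utils import (
    maybe_is_text,
    maybe_is_truncated,
    strings_similarity,
)

maybe_is_text("The quick brown fox jumps over the lazy dog.")  # True: character entropy > 2.5
maybe_is_text("aaaaaaaa")                                      # False: entropy is 0

maybe_is_truncated("This answer stops mid-sen")                # True: no closing punctuation
maybe_is_truncated("This answer is complete.")                 # False

strings_similarity("the cat sat", "the cat ran")               # 0.5: word-level Jaccard overlap
```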