Spaces:

rohan13
/

canvas-discussion-grader-with-feedback

Runtime error

App Files Files Community

rohan13 committed on Jul 28, 2023

Commit

0cad0b3

1 Parent(s): 028ac25

Custom csv loader

Browse files

Files changed (1) hide show

custom_csv_loader.py +114 -0

custom_csv_loader.py ADDED Viewed

	@@ -0,0 +1,114 @@

+import csv
+from typing import Any, Dict, List, Optional
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.unstructured import (
+    UnstructuredFileLoader,
+    validate_unstructured_version,
+)
+class CSVLoader(BaseLoader):
+    """Loads a CSV file into a list of documents.
+    Each document represents one row of the CSV file. Every row is converted into a
+    key/value pair and outputted to a new line in the document's page_content.
+    The source for each document loaded from csv is set to the value of the
+    `file_path` argument for all documents by default.
+    You can override this by setting the `source_column` argument to the
+    name of a column in the CSV file.
+    The source of each document will then be set to the value of the column
+    with the name specified in `source_column`.
+    Output Example:
+        .. code-block:: txt
+            column1: value1
+            column2: value2
+            column3: value3
+    """
+    def __init__(
+            self,
+            file_path: str,
+            source_column: Optional[str] = None,
+            csv_args: Optional[Dict] = None,
+            encoding: Optional[str] = None,
+    ):
+        """
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the source.
+              Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+              Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
+        self.file_path = file_path
+        self.source_column = source_column
+        self.encoding = encoding
+        self.csv_args = csv_args or {}
+    def load(self) -> List[Document]:
+        """Load data into document objects."""
+        docs = []
+        with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
+            csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
+            for i, row in enumerate(csv_reader):
+                content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items() if k != 'restkey')
+                try:
+                    source = (
+                        row[self.source_column]
+                        if self.source_column is not None
+                        else self.file_path
+                    )
+                except KeyError:
+                    raise ValueError(
+                        f"Source column '{self.source_column}' not found in CSV file."
+                    )
+                metadata = {"source": source, "row": i}
+                doc = Document(page_content=content, metadata=metadata)
+                docs.append(doc)
+        return docs
+class UnstructuredCSVLoader(UnstructuredFileLoader):
+    """Loader that uses unstructured to load CSV files. Like other
+    Unstructured loaders, UnstructuredCSVLoader can be used in both
+    "single" and "elements" mode. If you use the loader in "elements"
+    mode, the CSV file will be a single Unstructured Table element.
+    If you use the loader in "elements" mode, an HTML representation
+    of the table will be available in the "text_as_html" key in the
+    document metadata.
+    Examples
+    --------
+    from langchain.document_loaders.csv_loader import UnstructuredCSVLoader
+    loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
+    docs = loader.load()
+    """
+    def __init__(
+            self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
+    ):
+        """
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+              Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
+        validate_unstructured_version(min_unstructured_version="0.6.8")
+        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
+    def _get_elements(self) -> List:
+        from unstructured.partition.csv import partition_csv
+        return partition_csv(filename=self.file_path, **self.unstructured_kwargs)