Spaces:

peter2000
/

policy_test

Runtime error

peter2000 committed on Sep 27, 2022

Commit

c992322

1 Parent(s): daf15f4

Update scripts/process.py

Files changed (1) hide show

scripts/process.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import streamlit as st
 from typing import Callable, Dict, List, Optional
 import os
 from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
 from haystack.schema import Answer
@@ -26,7 +27,33 @@ os.environ['TOKENIZERS_PARALLELISM'] ="false"
 #    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 #    document_store.write_documents(docs)
 def load_document(
     file_path: str,
     file_name,

 import streamlit as st
 from typing import Callable, Dict, List, Optional
+import re
 import os
 from haystack.utils import fetch_archive_from_http, clean_wiki_text, convert_files_to_docs
 from haystack.schema import Answer
 #    docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)
 #    document_store.write_documents(docs)
+def basic(s):
+    """
+    Remove URLs from a string and trim surrounding whitespace.
+
+    Two substitutions are applied in order: every line that begins with
+    ``http://`` or ``https://`` is replaced (whole line plus its trailing
+    line breaks) by a single space, then every remaining run of
+    non-whitespace characters starting with ``http`` is replaced by a space.
+    All other cleaning steps in the body are commented out and have no effect.
+
+    :param s: string to be processed
+    :return: the string with URLs replaced by spaces and leading/trailing
+        whitespace stripped
+    """
+    # Disabled step: lowercase the text
+    #s = s.lower()
+    # Disabled step: remove punctuation
+    #translator = str.maketrans(' ', ' ', string.punctuation)
+    #s = s.translate(translator)
+    # Remove URLs: first whole lines starting with http(s)://, then any leftover "http..." token
+    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
+    s = re.sub(r"http\S+", " ", s)
+    # Disabled step: replace newline characters with spaces
+    #s = re.sub('\n', ' ', s)
+    # Disabled step: replace single quotes with spaces
+    #s = re.sub("\'", " ", s)
+    # Disabled step: replace digits and runs of non-word characters with spaces
+    #s = re.sub(r'\d+', ' ', s)
+    #s = re.sub(r'\W+', ' ', s)
+    # Disabled step: custom word replacements (e.g. split a fused word pair)
+    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
+    return s.strip()
 def load_document(
     file_path: str,
     file_name,