Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	File size: 2,018 Bytes
			
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 
@author:  
@title: clean_dataset
@descriptions: set of functions for splitting text into fixed-width chunks and for cleaning text.
"""
#%%
import pandas as pd
import numpy as np
import string
from itertools import chain
from textwrap3 import wrap
import re
def split_at_length(dataframe, column, length, title = True):
    """
    Wrap the text of one column into chunks of at most ``length`` characters.

    :param dataframe: pandas DataFrame holding the text; the input object is
        left untouched, a new frame with the extra column is returned
    :param column: name of the column whose values are wrapped (every value
        is converted with ``str`` first)
    :param length: maximum chunk width, handed to ``wrap``
    :param title: if truthy, additionally build a long-format table with one
        row per chunk; this reads the ``'title'`` column of ``dataframe``
    :return: tuple ``(dataframe, splitted)`` where ``dataframe`` has a new
        ``'wrapped'`` column (the chunks of each row joined with ``'; '``) and
        ``splitted`` is a DataFrame with columns ``'text'`` and ``'title'``,
        or an empty list when ``title`` is falsy
    """
    chunks_per_row = [wrap(str(value), length) for value in dataframe[column]]
    dataframe = dataframe.assign(
        wrapped=['; '.join(chunks) for chunks in chunks_per_row]
    )

    if not title:
        return dataframe, []

    # Build the long table straight from the chunk lists. Re-splitting the
    # joined string on '; ' would also cut chunks whose own text contains
    # '; ', and pd.concat would fail on a dataframe without rows.
    texts = []
    titles = []
    for row_title, chunks in zip(dataframe['title'], chunks_per_row):
        # wrap() yields no chunks for empty text; keep one empty-text row so
        # that every input row is still represented in the output.
        for chunk in chunks or ['']:
            texts.append(chunk)
            titles.append(row_title)
    splitted = pd.DataFrame({"text": texts, "title": titles})

    return dataframe, splitted
def basic(s):
    """
    Normalise a piece of text for further processing.

    :param s: string to be processed
    :return: lower-cased string without punctuation, URL-like tokens, digits
        or other non-word characters, with runs of separators collapsed to a
        single space and surrounding whitespace stripped
    """
    # Lower-case, then delete every character listed in string.punctuation
    # (deleted, not replaced by a space, so "well-known" becomes "wellknown").
    stripped = s.lower().translate(str.maketrans(' ', ' ', string.punctuation))

    # Each pattern is replaced by a single space, in this exact order.
    # NOTE(review): ':' and '/' are already gone at this point, so the
    # line-anchored URL pattern cannot match; URLs are removed by the
    # "http\S+" pattern acting on the punctuation-free token.
    steps = (
        (r'^https?:\/\/.*[\r\n]*', re.MULTILINE),  # lines starting with a URL
        (r"http\S+", 0),                           # any token starting with "http"
        ('\n', 0),                                 # new line characters
        ("\'", 0),                                 # single quotes
        (r'\d+', 0),                               # runs of digits
        (r'\W+', 0),                               # runs of non-word characters
    )
    cleaned = stripped
    for pattern, flags in steps:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    return cleaned.strip()
def remove_linebreaks(s):
    """
    Replace new line characters with spaces.

    :param s: string to be processed
    :return: the string with every ``'\\n'`` replaced by a single space and
        leading/trailing whitespace stripped
    """
    return s.replace('\n', ' ').strip()