preprocessing Text Full and path
Wed Aug 25 2021 17:43:54 GMT+0000 (Coordinated Universal Time)
Saved by @QuinnFox12 #chinese #visualization
# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

import re
import string

# NOTE(review): `jieba` and a callable named `stopwords` (presumably
# `stopwordsiso.stopwords`) are used below but are not imported anywhere in
# this snippet -- confirm they are provided by the surrounding session/file.

# Patterns are compiled once at import time and shared by the helpers below.
_HTML_TAG_RE = re.compile(r'<.*?>')
_ZH_PUNCT_RE = re.compile(r"""[@\-;><:?.!/_,$%^("']+""")
_WHITESPACE_RE = re.compile(r'\s+')
_BRACKETED_RE = re.compile(r"\[(.*?)\]")
_ELLIPSIS_RE = re.compile(r"\w+…|…")
_INTRAWORD_DASH_RE = re.compile(r"(?<=\w)-(?=\w)")
_ASCII_PUNCT_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_URL_RE = re.compile(r'http\S+\s*')
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')

# Extra stop words added on top of the list returned by `stopwords(["zh"])`.
_EXTRA_ZH_STOPWORDS = ('请问', '谢谢您', '谢谢你', '谢谢', '您好', '_')


# clean text and seg
def preprocessingTextFull(text, sep=' '):
    """Clean a piece of text and segment it with jieba.

    Steps, in order: lowercase, strip HTML tags, drop stray ``<``, replace
    runs of punctuation with a space, collapse whitespace, segment with
    ``jieba.cut(..., cut_all=False)`` and drop tokens that are stop words.

    Args:
        text: The string to clean.
        sep: Separator placed between the surviving tokens.

    Returns:
        The surviving tokens joined by ``sep``.
    """
    text = text.lower()
    # Tags must be stripped before lone '<' characters are removed; doing it
    # the other way round leaves nothing for the tag pattern to match.
    text = _HTML_TAG_RE.sub('', text)
    text = text.replace('<', '')
    text = _ZH_PUNCT_RE.sub(' ', text)  # remove punctuation
    # '\s+' already covers '\r', '\n' and '\t', so one pass is enough.
    text = _WHITESPACE_RE.sub(' ', text)

    stop = set(stopwords(["zh"]))
    stop.update(_EXTRA_ZH_STOPWORDS)

    # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    # Filter whole tokens, not characters: a per-character filter can never
    # remove multi-character stop words and mangles words that merely
    # contain a single-character stop word.
    tokens = [tok for tok in jieba.cut(text, cut_all=False) if tok not in stop]
    return sep.join(tokens)


# ________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens.

    Args:
        text: Text to tokenize; it is converted with ``str()`` first.
        tokenizer: Callable that takes the cleaned string and returns an
            iterable of string tokens.
        stopwords: Container of tokens to discard (tested with ``in``).

    Returns:
        List of tokens that are not stop words, are not purely digits and
        are longer than one character.
    """
    text = str(text).lower()                    # Lowercase words
    text = _BRACKETED_RE.sub("", text)          # Remove [+XYZ chars] in content
    text = _WHITESPACE_RE.sub(" ", text)        # Remove multiple spaces in content
    text = _ELLIPSIS_RE.sub("", text)           # Remove ellipsis (and last word)
    text = _INTRAWORD_DASH_RE.sub(" ", text)    # Replace dash between words
    text = _ASCII_PUNCT_RE.sub("", text)        # Remove punctuation

    return [
        tok
        for tok in tokenizer(text)
        if tok not in stopwords and not tok.isdigit() and len(tok) > 1
    ]


# ________________________________________________________________________________
def cleanResume(resumeText):
    """Strip URLs, RT/cc markers, hashtags, mentions, punctuation and
    non-ASCII characters from ``resumeText`` and collapse whitespace.

    Args:
        resumeText: The raw text to clean.

    Returns:
        The cleaned string.
    """
    resumeText = _URL_RE.sub(' ', resumeText)          # remove URLs
    # Word boundaries keep "cc"/"RT" inside other words (e.g. "account") intact.
    resumeText = _RT_CC_RE.sub(' ', resumeText)        # remove RT and cc
    resumeText = _HASHTAG_RE.sub('', resumeText)       # remove hashtags
    resumeText = _MENTION_RE.sub(' ', resumeText)      # remove mentions
    resumeText = _ASCII_PUNCT_RE.sub(' ', resumeText)  # remove punctuations
    resumeText = _NON_ASCII_RE.sub(' ', resumeText)    # remove non-ASCII characters
    resumeText = _WHITESPACE_RE.sub(' ', resumeText)   # remove extra whitespace
    return resumeText


# NOTE(review): `resumeDataSet` is not defined in this snippet; presumably a
# pandas DataFrame with a `Resume` column -- confirm against the caller.
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(cleanResume)

# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
# text = re.sub(r'[0-9]+', '', text) #remove number
# text = re.sub(r'[^\w\s]', '', text) #remove punctuation
# text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # remove ASCII strings
# text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
# text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctuation, keep ****
Comments