# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'
# clean text and seg
def preprocessingTextFull(text, sep=' '):
    """Clean a Chinese text string and segment it with jieba.

    Args:
        text: Raw input string.
        sep: Separator placed between segmented tokens (default: single space).

    Returns:
        The cleaned, jieba-segmented string.
    """
    text = text.lower()
    # FIX: strip HTML tags *before* removing stray '<' characters — the
    # original removed every '<' first, so the tag regex could never match.
    text = re.sub(r'<.*?>', '', text)   # remove HTML tags
    text = re.sub(r'<', '', text)       # remove any leftover '<'
    # remove punctuation (raw string avoids invalid-escape warnings)
    text = re.sub(r"[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+", ' ', text)
    # Stopword list: language stopwords plus domain-specific extras.
    # FIX: added the comma that was missing between '谢谢你' and '谢谢' —
    # implicit concatenation had fused them into '谢谢你谢谢'.
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问', '谢谢您', '谢谢你', '谢谢', '您好', '_']
    stop = stop_words + more_s
    # NOTE(review): this iterates over *characters* (text is not segmented
    # yet), so only single-character stopwords can match here — confirm intent.
    text = "".join([ch for ch in text if ch not in stop])
    for c in ['\r', '\n', '\t']:
        text = re.sub(c, ' ', text)     # replace newlines/tabs with spaces
    text = re.sub(r'\s+', ' ', text)    # collapse runs of whitespace
    # Segment with jieba in accurate (non-full) mode and join with `sep`.
    text_cut = sep.join(jieba.cut(text, cut_all=False))
    return text_cut
#________________________________________________________________________________
def clean_text(text, tokenizer, stopwords):
    """Normalize *text* and return its filtered token list.

    Args:
        text: Text to tokenize (coerced to ``str``).
        tokenizer: Callable mapping a string to a sequence of tokens.
        stopwords: Collection of tokens to discard.

    Returns:
        List of tokens with stopwords, pure digits, and 1-character
        tokens removed.
    """
    normalized = str(text).lower()  # Lowercase words
    # Apply the regex clean-up passes in a fixed order.
    passes = (
        (r"\[(.*?)\]", ""),         # Remove [+XYZ chars] in content
        (r"\s+", " "),              # Remove multiple spaces in content
        (r"\w+…|…", ""),            # Remove ellipsis (and last word)
        (r"(?<=\w)-(?=\w)", " "),   # Replace dash between words
        (f"[{re.escape(string.punctuation)}]", ""),  # Remove punctuation
    )
    for pattern, replacement in passes:
        normalized = re.sub(pattern, replacement, normalized)
    # Tokenize, then filter lazily: drop stopwords, blank out digit-only
    # tokens, and finally keep only tokens longer than one character.
    without_stops = (tok for tok in tokenizer(normalized) if tok not in stopwords)
    digits_blanked = ("" if tok.isdigit() else tok for tok in without_stops)
    return [tok for tok in digits_blanked if len(tok) > 1]
#________________________________________________________________________________
import re
def cleanResume(resumeText):
    """Strip URLs, social-media artifacts, punctuation, and non-ASCII
    characters from a resume string.

    Args:
        resumeText: Raw resume text.

    Returns:
        Cleaned ASCII text with whitespace runs collapsed to single spaces.
    """
    # FIX: raw strings throughout — '\S' / '\s' in plain strings are invalid
    # escape sequences (DeprecationWarning today, SyntaxError in the future).
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # NOTE(review): matches 'RT'/'cc' anywhere, even inside words such as
    # 'account' — confirm this substring behavior is intended.
    resumeText = re.sub(r'RT|cc', ' ', resumeText)       # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)         # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)        # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)  # drop non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)         # remove extra whitespace
    return resumeText
# Apply cleanResume row-wise to the Resume column, storing the result in a
# new 'cleaned_resume' column.
# NOTE(review): `resumeDataSet` is not defined in this chunk — presumably a
# pandas DataFrame loaded elsewhere; confirm it exists before this line runs.
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你','谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
# text = re.sub(r'[0-9]+', '', text) #remove number
# text = re.sub(r'[^\w\s]', '', text) #remove punctiation
# text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # remove ASCII strings
# text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
# text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation, keep ****
# Comments