Preview:
# print('Input DataSet Name')
# dataset = input()
# print('Input Number of Classes')
# classes = int(input())
# dataset_path = 'pre_processed_df/' + 'pre_processed_' + dataset + '.csv'

# clean text and seg
def preprocessingTextFull(text, sep = ' '):
    text = text.lower()
    text = re.sub(r'<', '', text) #remove '<' tag
    text = re.sub(r'<.*?>', '', text) #remove html
    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation
    # remove stopword
    stop_words = list(stopwords(["zh"]))
    more_s = ['请问','谢谢您','谢谢你''谢谢','您好','_']
    stop = stop_words + more_s
    text = "".join([word for word in text if word not in stop]) #remove stopwords
    
    for c in ['\r', '\n', '\t'] :
        text = re.sub(c, ' ', text) #replace newline and tab with tabs\
        text = re.sub('\s+', ' ', text) #replace multiple spaces with one space
#         text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    text_cut = sep.join(jieba.cut(text, cut_all=False))
        
    return text_cut

# more_s = ['请问', '没', 'kiehls', 'linepaymoney','谢谢您','谢谢你''谢谢','您好', '姓名','元', '电话', '手机', 'line', 'pay', 'money','不能', '一下', '需要','linepay', '今天', '现在', '最近','_','公司','point','没有']
#     text = re.sub(r'[0-9]+', '', text) #remove number
#     text = re.sub(r'[^\w\s]', '', text) #remove punctiation
#     text = re.sub('[^\u4e00-\u9fa5]+', ' ', text) # remove ASCII strings
#   text = re.sub(r'[^\x00-\x7f]', '', text) #remove non ASCII strings
#    text = re.sub("[\@\-\;\>\<\:\?\.\!\/_,$%^(\"\']+" , ' ' , text) #remove punctiation, keep ****
downloadDownload PNG downloadDownload JPEG downloadDownload SVG

Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!

Click to optimize width for Twitter