# Strips every whitespace character plus a fixed set of ASCII and full-width
# (CJK) punctuation/symbol characters. Written as a raw string so the regex
# escapes are not interpreted as (invalid) Python string escapes, and compiled
# once at import time instead of on every call.
_PUNCT_RE = re.compile(
    r"""[\s+\>\<\:\?\.\!\/_,$%^*(+"']+"""
    r"""|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+"""
)


def punc_jieba(text, sep=' '):
    """Remove punctuation/whitespace from *text* and segment it with jieba.

    Stopwords are NOT removed here; use ``stop_word`` for that.

    Args:
        text: The raw string to clean.
        sep: Separator placed between the segments produced by jieba.

    Returns:
        The lower-cased segments joined by *sep*.

    Note:
        Whitespace is deleted (not replaced) before segmentation, so words
        that were separated only by spaces are concatenated before jieba
        sees them.
    """
    text_punc = _PUNCT_RE.sub("", text)
    return sep.join(jieba.cut(text_punc, cut_all=False)).lower()


# Loaded once at module level rather than once per row inside ``stop_word``.
# NOTE(review): presumably a set (e.g. stopwordsiso) so membership tests are
# O(1) -- confirm against the actual ``stopwords`` import.
stopword = stopwords(['zh'])


def stop_word(text):
    """Return the tokens of *text* that are not in ``stopword``.

    Args:
        text: Either a string of whitespace-separated tokens (such as the
            output of ``punc_jieba``) or an iterable of token strings.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # A bare string would otherwise be iterated character by character.
    tokens = text.split() if isinstance(text, str) else text
    return [word for word in tokens if word not in stopword]


# The two methods below are alternatives: method 1 leaves a list of tokens in
# each cell, method 2 leaves a space-joined string. Method 2 goes through
# ``stop_word`` so it also accepts the lists produced by method 1; running
# both is therefore harmless, but either one alone is sufficient.

# method 1
df['text'] = df['text'].apply(stop_word)

# method 2
df['text'] = df['text'].apply(lambda x: ' '.join(stop_word(x)))
Preview:
Download PNG
Download JPEG
Download SVG
Tip: You can change the style, width & colours of the snippet with the inspect tool before clicking Download!
Click to optimize width for Twitter