Remove punctuation and stopwords from Chinese text
Fri Aug 13 2021 06:21:01 GMT+0000 (Coordinated Universal Time)
Saved by
@QuinnFox12
#python
#pandas
#column
#nlp
#chinese
#trasim
# Remove punctuation and whitespace, then segment the text with jieba.
def punc_jieba(text, sep=' '):
    """Strip punctuation and whitespace from *text*, then segment it with jieba.

    Args:
        text: Input string to clean.
        sep: Separator placed between the segments yielded by ``jieba.cut``.

    Returns:
        The segments joined by ``sep`` and lower-cased. Stopwords are NOT
        removed by this function.
    """
    # Raw strings keep \s, \. etc. intact for the regex engine; in a normal
    # string literal they are invalid escape sequences and raise a warning on
    # current Python versions.
    # First alternative: whitespace and ASCII punctuation.
    # Second alternative: full-width / CJK punctuation and decorative symbols.
    punctuation = (
        r"[\s+\>\<\:\?\.\!\/_,$%^*(+\"\']+"
        r"|[+——!,。?、~@#¥%……&*()!,❤。~《》:()【】「」?”“;:、【】╮╯▽╰╭★→「」]+"
    )
    text_punc = re.sub(punctuation, "", text)
    # cut_all=False asks jieba for its non-exhaustive (accurate mode) segmentation.
    text_cut = sep.join(jieba.cut(text_punc, cut_all=False)).lower()
    return text_cut
# Method 1: token-level stopword removal (each result is a list of tokens).
def stop_word(text, stopword_set=None):
    """Remove stopwords from *text* and return the remaining tokens.

    Args:
        text: Either a whitespace-separated string (for example segmenter
            output) or an iterable of tokens.
        stopword_set: Optional collection of words to drop. When omitted, the
            result of ``stopwords(['zh'])`` is used.

    Returns:
        A list of the tokens that are not stopwords, in their original order.
    """
    if stopword_set is None:
        stopword_set = stopwords(['zh'])
    # Iterating a str yields single characters rather than words, so split a
    # string into whitespace-separated tokens before filtering.
    tokens = text.split() if isinstance(text, str) else text
    return [word for word in tokens if word not in stopword_set]
# Apply method 1 to every row: each cell of df['text'] is replaced by the
# list that stop_word returns for it.
df['text'] = df['text'].apply(stop_word)
# Method 2: drop stopwords from whitespace-separated text in one pass,
# keeping every cell a space-joined string.
stopword = stopwords(['zh'])
df['text'] = df['text'].apply(
    lambda x: ' '.join(filter(lambda word: word not in stopword, x.split()))
)
content_copyCOPY
Comments