import gensim
import nltk
import spacy
from collections import Counter
from gensim.utils import simple_preprocess

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=12)  # higher threshold, fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    # remove_stopwords was called but never defined in the original snippet; this is
    # a minimal version, assuming a stop_words list exists (e.g. from nltk.corpus.stopwords)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams (on the stopword-free tokens, so the two steps compose)
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy model, keeping only the tagger component (for efficiency).
# Run once in a shell first: python3 -m spacy download zh_core_web_md
nlp = spacy.load('zh_core_web_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

##################################################################
# Count POS-tagged bigrams and keep those that start with an adjective.
# product_name and tagged_sentences are assumed to be defined earlier.
file = open(product_name, 'w')
bags = nltk.bigrams(tagged_sentences)
distribution = nltk.FreqDist(bags)
c = Counter(distribution)
for k, count in c.most_common():
    if k[0][1] == 'JJ':  # first token of the bigram is tagged as an adjective
        # placeholder for the original "do something..." step, e.g. write the bigram out
        file.write("%s %s\t%d\n" % (k[0][0], k[1][0], count))
file.close()

###########################################################
# df is assumed to be a pandas DataFrame with a 'species' text column.
tokens = []
lemma = []
pos = []
# n_threads was removed from nlp.pipe in spaCy v2.1+; use n_process in v3 if needed
for doc in nlp.pipe(df['species'].astype(str).values, batch_size=50):
    if doc.has_annotation("TAG"):  # modern replacement for the deprecated doc.is_parsed
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # Keep the result lists the same length as the original DataFrame,
        # so add blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

df['species_tokens'] = tokens
df['species_lemma'] = lemma
df['species_pos'] = pos
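##################################################################
# A minimal sketch of the step this preprocessing usually feeds: building a
# gensim dictionary and bag-of-words corpus from data_lemmatized. The names
# id2word and corpus are illustrative assumptions, not part of the original.
import gensim.corpora as corpora

id2word = corpora.Dictionary(data_lemmatized)  # map each token to an integer id
corpus = [id2word.doc2bow(text) for text in data_lemmatized]  # (token_id, count) pairs per doc
print(corpus[:1])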