thiscodeWorks - Organizing the best of code online

# https://www.dataquest.io/blog/tutorial-time-series-analysis-with-pandas/
df['TIME'] =  pd.to_datetime(df['Time'],unit='s')
df_time = df.set_index('TIME')
# Add columns with year, month, and Weekday Name
df_time['Year'] = df_time.index.year
df_time['Month'] = df_time.index.month
df_time['Weekday Name'] = df_time.index.weekday_name

# Display a random sampling of 5 rows
df_time.sample(5, random_state=0)

# Visualizing time series data
sns.set(rc={'figure.figsize':(11, 4)})

#nlp #pos #ngram

Retrain spacy tagger pos

from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse
vocab = Vocab(tag_map={'N': {'pos': 'NOUN'}, 'V': {'pos': 'VERB'}})
tagger = Tagger(vocab)
doc = Doc(vocab, words=['I', 'like', 'stuff'])
gold = GoldParse(doc, tags=['N', 'V', 'N'])
tagger.update(doc, gold)
tagger.model.end_training()

#nlp #pos #ngram

n-gram with filter POS tag

# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=12) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

nlp = spacy.load('zh_core_web_md', disable=['parser', 'ner'])

# Define functions for stopwords, bigrams, trigrams and lemmatization

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
python3 -m spacy download en
nlp = spacy.load('zh_core_web_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_words_bigrams[:1])


##################################################################
file=open(product_name,'w');
bags=nltk.bigrams(tagged_sentences)
distribution = nltk.FreqDist(bags)
c = Counter(distribution)
for k,count in c.most_common():
  if ((k[0][1])=='JJ')):
    do something...
###########################################################
tokens = []
lemma = []
pos = []

for doc in nlp.pipe(df['species'].astype('unicode').values, batch_size=50,
                        n_threads=3):
    if doc.is_parsed:
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original Dataframe, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

df['species_tokens'] = tokens
df['species_lemma'] = lemma
df['species_pos'] = pos

working with time series snippets

Retrain spacy tagger pos

n-gram with filter POS tag

Save snippets that work with our extensions