n-gram with POS tag filter


Mon Oct 18 2021 07:07:09 GMT+0000 (Coordinated Universal Time)

Saved by @QuinnFox12 #nlp #pos #ngram

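The snippets below assume `data_words`, a tokenized corpus (a list of token lists), which is never built in the original. A minimal sketch using gensim's `simple_preprocess`; the two sample documents are hypothetical:

import gensim
from gensim.utils import simple_preprocess

# Hypothetical raw corpus; substitute your own documents
documents = ["Better late than never, but better never late.",
             "The quick brown fox jumps over the lazy dog."]
# Tokenize and lowercase; deacc=True also strips accents and punctuation
data_words = [simple_preprocess(doc, deacc=True) for doc in documents]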
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=12) # a higher threshold yields fewer phrases
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# A faster way to turn a sentence into bigram/trigram phrases
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])
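Detected phrases come back as single tokens joined by the model's delimiter (underscore by default); a hypothetical spot check:

# Hypothetical: if 'new' and 'york' co-occur often enough in data_words,
# the phraser emits them as one token, e.g. ['the', 'new_york', 'times']
print(bigram_mod[['the', 'new', 'york', 'times']])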

# Define functions for stopwords, bigrams, trigrams and lemmatization

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize each document, keeping only tokens whose POS tag is allowed.
    Tag reference: https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
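
`remove_stopwords` is called below but never defined in the snippet; a minimal sketch, assuming NLTK's English stop-word list (requires nltk.download('stopwords')):

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]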
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams from the stop-word-free tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English model, keeping only the tagger component (for efficiency).
# Download it first if needed: python3 -m spacy download en_core_web_sm
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization, keeping only nouns, adjectives, verbs, and adverbs
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])


##################################################################
import nltk
from collections import Counter

# Assumes product_name (an output path) and tagged_sentences
# (a flat list of (token, POS) tuples) are defined by the caller
out_file = open(product_name, 'w')
bags = nltk.bigrams(tagged_sentences)
distribution = nltk.FreqDist(bags)
c = Counter(distribution)
for k, count in c.most_common():
    if k[0][1] == 'JJ':  # first token of the bigram is an adjective
        pass  # do something...
###########################################################
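
For a self-contained check of the same filter, a runnable sketch on a hypothetical sentence (needs the NLTK punkt and averaged_perceptron_tagger data):

import nltk
from collections import Counter

# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
text = "The quick brown fox jumps over the lazy dog"
tagged = nltk.pos_tag(nltk.word_tokenize(text))

# Keep only bigrams whose first token is tagged as an adjective (JJ)
for pair, count in Counter(nltk.FreqDist(nltk.bigrams(tagged))).most_common():
    if pair[0][1] == 'JJ':
        print(pair, count)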
tokens = []
lemma = []
pos = []

# Assumes df is a pandas DataFrame with a text column named 'species'.
# n_threads was removed in spaCy v3; n_process controls multiprocessing.
for doc in nlp.pipe(df['species'].astype(str).values, batch_size=50,
                    n_process=3):
    if doc.has_annotation("TAG"):  # replaces the deprecated doc.is_parsed check
        tokens.append([n.text for n in doc])
        lemma.append([n.lemma_ for n in doc])
        pos.append([n.pos_ for n in doc])
    else:
        # Keep the result lists aligned with the DataFrame rows:
        # add blanks when annotation fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)

df['species_tokens'] = tokens
df['species_lemma'] = lemma
df['species_pos'] = pos
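
To close the loop on the title, a sketch that filters each row down to a single POS tag, assuming the three columns built above:

# Keep only the nouns from each row; rows that failed annotation stay None
df['species_nouns'] = [
    [tok for tok, tag in zip(toks, tags) if tag == 'NOUN'] if toks else None
    for toks, tags in zip(df['species_tokens'], df['species_pos'])
]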


https://www.thiscodeworks.com/n-gram-with-filter-pos-tag-nlp-pos-ngram/616d1d1da634b8001500613b