Train Fasttext - GloVe algorithm on my own corpus

PHOTO

Thu Apr 21 2022 17:21:57 GMT+0000 (Coordinated Universal Time)

Saved by @QuinnFox12 #pandas #list #group

#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

# Create the Corpus object that will hold the co-occurrence statistics
corpus = Corpus() 

# Build the co-occurrence matrix that GloVe is trained on.
# NOTE(review): `lines` is not defined in this snippet; presumably it is an
# iterable of tokenized sentences (lists of words) -- confirm before running.
corpus.fit(lines, window=10)

# NOTE(review): no_components presumably sets the embedding dimensionality;
# 5 looks like a demo value -- confirm the size you actually need.
glove = Glove(no_components=5, learning_rate=0.05) 
# Train on the matrix produced by corpus.fit above
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Hand the corpus dictionary to the model (presumably the word <-> id mapping
# needed to look vectors up by word -- verify against the glove_python docs)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
 Save

 
 
 # FastText with gensim (4.x API): train from in-memory sentences, a corpus file, or a streaming iterable.
import numpy as np
from gensim import utils
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
from gensim.test.utils import datapath, get_tmpfile
from gensim.utils import tokenize

print(common_texts[0])  # ['human', 'interface', 'computer']
print(len(common_texts))  # 9

# 1) In-memory sentences, explicit two-step form.
# build_vocab() and train() take `corpus_iterable=` in gensim 4; the older
# `sentences=` keyword is only accepted by the constructor.
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)  # train

# Same training in one step through the constructor
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

# The original transcript shows this comparison evaluating to True
print(np.allclose(model.wv['computer'], model2.wv['computer']))

# 2) Train straight from a corpus file on disk
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


# 3) Train from a streaming iterable
class MyIter:
    """Iterable that re-reads the text file and yields one token list per line.

    A fresh pass over the file starts on every ``__iter__`` call, so the
    same class can feed both the vocabulary scan and the training epochs.
    """

    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(corpus_iterable=MyIter())
total_examples = model4.corpus_count
model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5)

# Save the first model to a temporary file and load it back
fname = get_tmpfile("fasttext.model")
model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html

COPY

https://medium.com/analytics-vidhya/word-vectorization-using-glove-76919685ee0b

Save snippets that work from anywhere online with our extensions

Comments

python code

@QuinnFox12

.partition() (picking up piece of string between separators) findall and search text between 2 strings Combine columns in dataframe create and save dataframe to csv replace function in dataframe apply function and def Read filenames in folder Python regex related snippets create and save dataframe to csv mount Drive and colab Print function working with time series snippets Train Fasttext - GloVe algorithm on my own corpus Extract Tables from PDFs or website Remove blank cell in pandas dataframe time in pandas dataframe python vlookup dataframe dataframe all related

embedding

@QuinnFox12

n-gram with filter POS tag Train Fasttext - GloVe algorithm on my own corpus

NLP

@QuinnFox12

convert tra to sim chinese remove punc and stopword chinese Chinese POS most common words for each sector and visualize preprocessing Text Full and path convert dataframe to txt, to list preprocessing Text Full and path colab common useful snippets multi txt to pandas convert stopword list from sim to tra Pandas selection iloc loc note n-gram with filter POS tag Retrain spacy tagger pos Train Fasttext - GloVe algorithm on my own corpus Extract Text From PDF with Python tạo wordcloud

embedding

@QuinnFox12

n-gram with filter POS tag Train Fasttext - GloVe algorithm on my own corpus

#python #pandas #data-cleaning

Forward fills Column names

def ffill_cols(df, cols_to_fill_name='Unn'):
    """
    Forward fills column names. Every column whose name starts with
    ``cols_to_fill_name`` receives the name of the closest preceding column
    that does not, similar to pandas ffill().

    Placeholder columns at the very start of the frame have nothing to
    inherit from and keep their own name (they are not filled from the last
    column). Non-string column labels are never treated as placeholders.

    :param df: pandas Dataframe; Dataframe whose column names are read (the
    frame itself is not modified)
    :param cols_to_fill_name: str; Prefix of the column names you would like
    forward filled. Default is 'Unn' as the default name pandas gives unnamed
    columns is 'Unnamed'

    :returns: list; List of new column names
    """
    cols = df.columns.to_list()
    last_valid = None  # most recent non-placeholder name seen so far
    for i, name in enumerate(cols):
        if isinstance(name, str) and name.startswith(cols_to_fill_name):
            # Only fill when a real name precedes this column; indexing
            # cols[i - 1] at i == 0 would wrap around to the last column.
            if last_valid is not None:
                cols[i] = last_valid
        else:
            last_valid = name
    return cols

#python #python #math #list

Returns the maximum value of a list after applying a function to each element

def max_by(lst, fn):
  """Return the largest value obtained by applying ``fn`` to each item of ``lst``.

  The result is the mapped value, not the original list element. An empty
  ``lst`` raises ``ValueError`` because ``max`` has nothing to compare.
  """
  mapped_values = (fn(item) for item in lst)
  return max(mapped_values)

#python #datetime #pandas

Convert Excel Days Date to real date Pandas

# Excel's 1900 date system is anchored at 1899-12-30: serial 1 is 1900-01-01 and Excel
# also counts a non-existent 1900-02-29, so adding the serial to 1900-01-01 lands two days late.
df['SettlementDate'] = pd.to_datetime(df['SettlementDate'], unit='D', origin='1899-12-30')

#python #pandas

function to match strings in inconsistent data

# function to replace rows in the provided column of the provided dataframe
# that match the provided string above the provided ratio with the provided string
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    """Overwrite close spellings of ``string_to_match`` in ``df[column]``.

    The unique values of the column are scored against ``string_to_match``
    with fuzzywuzzy's token-sort ratio. Among the ten best-scoring values,
    each one whose score is at least ``min_ratio`` is replaced by
    ``string_to_match`` in every row where it occurs.

    The dataframe is modified in place and nothing is returned; a
    confirmation line is printed when the replacement is finished.
    """
    # distinct values currently present in the column
    unique_values = df[column].unique()

    # ten best candidates; each entry holds the value at [0] and its score at [1]
    scored_candidates = fuzzywuzzy.process.extract(
        string_to_match,
        unique_values,
        limit=10,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    )

    # keep the values whose score reaches the min_ratio threshold
    accepted_values = []
    for entry in scored_candidates:
        if entry[1] >= min_ratio:
            accepted_values.append(entry[0])

    # boolean mask of the rows holding one of the accepted values
    row_mask = df[column].isin(accepted_values)

    # write the canonical string into every matching row
    df.loc[row_mask, column] = string_to_match

    # let us know the function's done
    print("All done!")

#python #pandas #dataframe #crossjoin

Cross Join in Pandas

# Cartesian product of df1 and df2. how='cross' (pandas >= 1.2) needs no helper
# key column, so neither input frame is modified, no 'key' column leaks into
# the result, and an empty frame yields an empty product.

df1.merge(df2, how='cross')

#authentification #permissions #group

User, group and permission overview

# Check if 'webmasters' group exists (getent looks up the exact group name,
# so other groups or member lists containing the text do not match)
getent group webmasters

# Create 'webmasters' group
sudo addgroup webmasters

# Add users to 'webmasters' group
sudo usermod -a -G webmasters username

# INFO: Group assignment changes won't take effect until the users log out and back in.

# Change group owner of the directory to the 'webmasters' group
sudo chgrp -R webmasters /etc/nginx/
  
# Give write permission to the group
sudo chmod -R g+w /etc/nginx/
  
# Create file as different user
sudo -u username touch /etc/nginx/test.txt


# When using a number mask for permission representation there are only a few basic permissions

4: Read
2: Write
1: Execute

# Combined you get this table

+-----+---+--------------------------+
| rwx | 7 | read    write   execute  |
| rw- | 6 | read    write            |
| r-x | 5 | read            execute  |
| r-- | 4 | read                     |
| -wx | 3 |         write   execute  |
| -w- | 2 |         write            |
| --x | 1 |                 execute  |
| --- | 0 |                          |
+-----+---+--------------------------+
  
# The permissions for user, group and other are listed one after another when looking them up

+------------+------+-------+
| Permission | Octal| Field |
+------------+------+-------+
| rwx------  | 700  | User  |
| ---rwx---  | 070  | Group |
| ------rwx  | 007  | Other |
+------------+------+-------+
  
# This boils down to this
  
+------------------------+-----------+--------------------------------------+
| chmod u=rwx,g=rwx,o=rx | chmod 775 | For world readable directories		|
|                        |           |   Members of group can change files	|
| chmod u=rwx,g=rx,o=    | chmod 750 | For group readable directories		|
|                        |           |   Members of group cannot change files	|
| chmod u=rwx,go=        | chmod 700 | For private directories				|
+------------------------+-----------+--------------------------------------+

#pandas #list #group

group by the first column and get second column as lists in rows:

In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})
        df

Out[1]: 
   a  b
0  A  1
1  A  2
2  B  5
3  B  5
4  B  4
5  C  6

In [2]: df.groupby('a')['b'].apply(list)
Out[2]: 
a
A       [1, 2]
B    [5, 5, 4]
C          [6]
Name: b, dtype: object

In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
        df1
Out[3]: 
   a        new
0  A     [1, 2]
1  B  [5, 5, 4]
2  C        [6]

#pandas #list #group

Train Fasttext - GloVe algorithm on my own corpus

#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

# Create the Corpus object that will hold the co-occurrence statistics
corpus = Corpus() 

# Build the co-occurrence matrix that GloVe is trained on.
# NOTE(review): `lines` is not defined in this snippet; presumably it is an
# iterable of tokenized sentences (lists of words) -- confirm before running.
corpus.fit(lines, window=10)

# NOTE(review): no_components presumably sets the embedding dimensionality;
# 5 looks like a demo value -- confirm the size you actually need.
glove = Glove(no_components=5, learning_rate=0.05) 
# Train on the matrix produced by corpus.fit above
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Hand the corpus dictionary to the model (presumably the word <-> id mapping
# needed to look vectors up by word -- verify against the glove_python docs)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
 Save

 
 
 # FastText with gensim (4.x API): train from in-memory sentences, a corpus file, or a streaming iterable.
import numpy as np
from gensim import utils
from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
from gensim.test.utils import datapath, get_tmpfile
from gensim.utils import tokenize

print(common_texts[0])  # ['human', 'interface', 'computer']
print(len(common_texts))  # 9

# 1) In-memory sentences, explicit two-step form.
# build_vocab() and train() take `corpus_iterable=` in gensim 4; the older
# `sentences=` keyword is only accepted by the constructor.
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(corpus_iterable=common_texts)
model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)  # train

# Same training in one step through the constructor
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

# The original transcript shows this comparison evaluating to True
print(np.allclose(model.wv['computer'], model2.wv['computer']))

# 2) Train straight from a corpus file on disk
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


# 3) Train from a streaming iterable
class MyIter:
    """Iterable that re-reads the text file and yields one token list per line.

    A fresh pass over the file starts on every ``__iter__`` call, so the
    same class can feed both the vocabulary scan and the training epochs.
    """

    def __iter__(self):
        path = datapath('crime-and-punishment.txt')
        with utils.open(path, 'r', encoding='utf-8') as fin:
            for line in fin:
                yield list(tokenize(line))


model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(corpus_iterable=MyIter())
total_examples = model4.corpus_count
model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5)

# Save the first model to a temporary file and load it back
fname = get_tmpfile("fasttext.model")
model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html

#python #in #list

Python in statement

# Python program to illustrate
# finding a common member in two lists
# using the 'in' operator
def has_common_member(first, second):
    """Return True when at least one element of ``first`` also occurs in ``second``."""
    return any(item in second for item in first)


list1 = [1, 2, 3, 4, 5]
list2 = [6, 7, 8, 9]

# A for/else without a break always runs its else branch, which would print
# "not overlapping" even for overlapping lists; decide once and print once.
if has_common_member(list1, list2):
    print("overlapping")
else:
    print("not overlapping")

#servicenow #catalog #servicecatalog #variables #list #listview #view

PSA: How to View Variables in Lists - Developer Community - ServiceNow Community

PSA: How to View Variables in Lists
by
Bradford Shelley
Forum Level 2
created 4y ago (edited 3y ago ) in Developer Community
After having to play around with variables quite a bit in a recent project, I thought I'd share how to display variables on a list of Requested Items / Catalog Tasks. This applies to lists and related lists, as reports have their own method of displaying variables. Important note: This was performed on Fuji. Your experience may differ on older versions of ServiceNow.

Step 1 Identify the variables you'd like to display on your list, then copy the sys_id for each variable. This is as simple as heading to the Catalog Item, and jumping into the variable(s) in question. We'll need the sys_id to add the column into the list.

Step 2 Head over to System UI -> Lists

Step 3 Identify the list you'd like to display the variable(s) on. We're looking for one of two things here. Either the name of the view of the list you'd like to include the variable(s) on (I highly recommend creating a new view instead of using the Default view for this, as most likely the variables will apply to certain catalog items instead of every single one), or for a related list look at the Parent and Relationship columns for the table and name of the tab for the related list. Head into the list when you've found it.

Step 4 Create a new List Element from the related list, entering "variables.sys_id" (no quotes) in the Element field on the List Element form, where sys_id is the value copied in Step 1. Create a new List Element for each variable you wish to add.

Step 5 Head over to the list where variables were just added. They won't show up immediately, so don't panic. Edit the list layout, and move around the variables to the spots you'd like them in the list (even if the position is already how you'd like it, move a variable one position up/down, then move it back and save). At this point, the variables should be visible!

Hope this helps at least one person out there. Take care!

#python #list #dictionary

Create a List-of-Lists from a Dictionary

# Mapping of fruit name -> count
fruit = {
    "elderberries": 1,
    "figs": 1,
    "apples": 2,
    "durians": 3,
    "bananas": 5,
    "cherries": 8,
    "grapes": 13,
}

# One [name, count] row per dictionary entry, in insertion order
table_data = [[name, count] for name, count in fruit.items()]

#lazycolumn #list #jetpackcompose

Control Scrolling in Jetpack Compose

// Row holding two buttons that scroll a list to its first and last item.
// NOTE(review): coroutineScope, scrollState and listSize come from outside this
// fragment; scrollState is presumably a LazyListState -- confirm at the call site.
Row {
    Button(onClick = {
        // The scroll call is issued from inside a coroutine launched on coroutineScope
        coroutineScope.launch {
            // 0 is the first item index
            scrollState.animateScrollToItem(0)
        }
    }) {
        Text("Scroll to the top")
    }

    Button(onClick = {
        // Same pattern as above, targeting the final item instead
        coroutineScope.launch {
            // listSize - 1 is the last index of the list
            scrollState.animateScrollToItem(listSize - 1)
        }
    }) {
        Text("Scroll to the end")
    }
}

#numpy #booleans_in_numpy #list #forloop #for #loop

Loop over lists of lists

# Each entry pairs a room name with its area in square metres
house = [
    ["hallway", 11.25],
    ["kitchen", 18.0],
    ["living room", 20.0],
    ["bedroom", 10.75],
    ["bathroom", 9.50],
]

# Unpack every [name, area] pair and report it
for room, area in house:
    print(f"the {room} is {area} sqm")

#list #forloop #for #loop ##dictionary

Loop over dictionary

# Country -> capital lookup
europe = {
    'spain': 'madrid',
    'france': 'paris',
    'germany': 'berlin',
    'norway': 'oslo',
    'italy': 'rome',
    'poland': 'warsaw',
    'austria': 'vienna',
}

# Walk the (country, capital) pairs and print one sentence per pair
for country, capital in europe.items():
    print(f"the capital of {country} is {capital}")

#list #forloop #for #loop ##dictionary

Loop over numpy array

# numpy provides np.nditer, used for the second loop
import numpy as np

# Pattern for a 1D array:  for x in my_array
# Pattern for a 2D array:  for x in np.nditer(my_array)
# NOTE: np_height and np_baseball must already be defined when this runs.

# Print every entry of np_height followed by its unit
for height in np_height:
    print(f"{height!s} inches")

# Print every entry of np_baseball
for entry in np.nditer(np_baseball):
    print(entry)

Train Fasttext - GloVe algorithm on my own corpus

Save snippets that work from anywhere online with our extensions

Comments

More like this

python code

embedding

NLP

embedding

Browse more snippets >>

Train Fasttext - GloVe algorithm on my own corpus

Save snippets that work from anywhere online with our extensions

Comments

More like this

python code

embedding

NLP

embedding

Browse more snippets >>

Embed code snippet