thiscodeWorks - Organizing the best of code online

Train Fasttext - GloVe algorithm on my own corpus

#You can do it using GloVe library:

#Install it: 

!pip install glove_python

from glove import Corpus, Glove

# Create an empty Corpus object; fitting it below populates its
# co-occurrence matrix (corpus.matrix) and dictionary (corpus.dictionary).
corpus = Corpus() 

# Fit the corpus to build the co-occurrence matrix that GloVe trains on,
# using a context window of 10.
# NOTE(review): `lines` is not defined in this snippet -- presumably an
# iterable of tokenized sentences (lists of words); define it before running.
corpus.fit(lines, window=10)

# Configure the GloVe model: 5 components, learning rate 0.05.
glove = Glove(no_components=5, learning_rate=0.05) 
# Train on the co-occurrence matrix for 30 epochs with 4 threads.
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
# Attach the corpus dictionary to the trained model
# (presumably the word -> id mapping needed for word lookups; confirm in the library docs).
glove.add_dictionary(corpus.dictionary)
# Save the trained model to the file 'glove.model'.
glove.save('glove.model')
 Save

 
 
 #for Fasttext
 from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
>>>
print(common_texts[0])
['human', 'interface', 'computer']
print(len(common_texts))
9
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

import numpy as np
>>>
np.allclose(model.wv['computer'], model2.wv['computer'])
True


from gensim.test.utils import datapath
>>>
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
>>>
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


from gensim.utils import tokenize
from gensim import utils
>>>
>>>
class MyIter:
    """Re-iterable corpus reader.

    Every call to ``__iter__`` reopens 'crime-and-punishment.txt' (resolved
    through ``datapath``) and yields one list of tokens per line, so the
    same instance can be iterated more than once.
    """

    def __iter__(self):
        corpus_path = datapath('crime-and-punishment.txt')
        with utils.open(corpus_path, 'r', encoding='utf-8') as stream:
            # Tokenize lazily, one line at a time, while the file is open.
            yield from (list(tokenize(text_line)) for text_line in stream)
>>>
>>>
model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(sentences=MyIter())
total_examples = model4.corpus_count
model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
from gensim.test.utils import get_tmpfile
>>>
fname = get_tmpfile("fasttext.model")
>>>
model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html

#pandas #list #group

group by the first column and get second column as lists in rows:

In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})
        df

Out[1]: 
   a  b
0  A  1
1  A  2
2  B  5
3  B  5
4  B  4
5  C  6

In [2]: df.groupby('a')['b'].apply(list)
Out[2]: 
a
A       [1, 2]
B    [5, 5, 4]
C          [6]
Name: b, dtype: object

In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
        df1
Out[3]: 
   a        new
0  A     [1, 2]
1  B  [5, 5, 4]
2  C        [6]

#authentification #permissions #group

User, group and permission overview

# Check if the 'webmasters' group exists
cat /etc/group | grep webmasters

# Create the 'webmasters' group
sudo addgroup webmasters

# Add a user to the 'webmasters' group (replace 'username' with the actual account name)
sudo usermod -a -G webmasters username

# INFO: Group assignment changes won't take effect until the users log out and back in.

# Change the group owner of the directory (recursively) to the 'webmasters' group
sudo chgrp -R webmasters /etc/nginx/
  
# Give write permission to the group (recursively)
sudo chmod -R g+w /etc/nginx/
  
# Create a file as a different user
sudo -u username touch /etc/nginx/test.txt


# When using a number mask for permission representation there are only a few basic permissions

4: Read
2: Write
1: Execute

# Combined you get this table

+-----+---+--------------------------+
| rwx | 7 | read    write   execute  |
| rw- | 6 | read    write            |
| r-x | 5 | read            execute  |
| r-- | 4 | read                     |
| -wx | 3 |         write   execute  |
| -w- | 2 |         write            |
| --x | 1 |                 execute  |
| --- | 0 |                          |
+-----+---+--------------------------+
  
# The permissions for user, group and other are listed one after another when looking them up

+------------+------+-------+
| Permission | Octal| Field |
+------------+------+-------+
| rwx------  | 700  | User  |
| ---rwx---  | 070  | Group |
| ------rwx  | 007  | Other |
+------------+------+-------+
  
# This boils down to this
  
+------------------------+-----------+--------------------------------------+
| chmod u=rwx,g=rwx,o=rx | chmod 775 | For world readable directories		|
|                        |           |   Members of group can change files	|
| chmod u=rwx,g=rx,o=    | chmod 750 | For group readable directories		|
|                        |           |   Group members cannot change files	|
| chmod u=rwx,go=        | chmod 700 | For private directories				|
+------------------------+-----------+--------------------------------------+

Train Fasttext - GloVe algorithm on my own corpus

group by the first column and get second column as lists in rows:

User, group and permission overview

Save snippets that work with our extensions