Train FastText and GloVe models on my own corpus

PHOTO EMBED

Thu Apr 21 2022 17:21:57 GMT+0000 (Coordinated Universal Time)

Saved by @QuinnFox12 #pandas #list #group

# You can do it using the GloVe library:

# Install it:

!pip install glove_python

from glove import Corpus, Glove

# Create an empty Corpus object; after fitting, its `dictionary` and `matrix`
# attributes are read below.
corpus = Corpus() 

# Fit the corpus to build the co-occurrence matrix that GloVe trains on.
# NOTE(review): `lines` is not defined anywhere in this snippet — presumably an
# iterable of tokenized sentences (lists of words); confirm before running.
corpus.fit(lines, window=10)

# Configure the model, train it on the co-occurrence matrix, attach the corpus
# dictionary to the model, and save the result to 'glove.model'.
glove = Glove(no_components=5, learning_rate=0.05) 
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
 Save

 
 
 # For FastText (using gensim)
 from gensim.models import FastText
from gensim.test.utils import common_texts  # some example sentences
>>>
print(common_texts[0])
['human', 'interface', 'computer']
print(len(common_texts))
9
# Option 1: create an untrained model, build the vocabulary, then train explicitly.
model = FastText(vector_size=4, window=3, min_count=1)  # instantiate
model.build_vocab(sentences=common_texts)
model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)  # train
# Option 2: hand the same sentences and epoch count to the constructor in one call.
model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)

import numpy as np
>>>
# Compare the 'computer' vectors of the two models for element-wise closeness.
np.allclose(model.wv['computer'], model2.wv['computer'])
True


from gensim.test.utils import datapath
>>>
# Resolve the corpus path via gensim.test.utils.datapath.
corpus_file = datapath('lee_background.cor')  # absolute path to corpus
# Untrained model; here the vocabulary is built from a file path rather than
# from an in-memory iterable of sentences.
model3 = FastText(vector_size=4, window=3, min_count=1)
model3.build_vocab(corpus_file=corpus_file)  # scan over corpus to build the vocabulary
>>>
# Read the word count from the model (presumably populated by the preceding
# build_vocab call — confirm) and pass it to train() along with the file path.
total_words = model3.corpus_total_words  # number of words in the corpus
model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)


from gensim.utils import tokenize
from gensim import utils
>>>
>>>
class MyIter:
    """Iterable corpus that streams tokenized lines of 'crime-and-punishment.txt'.

    The file is opened anew on every call to ``__iter__``.
    """

    def __iter__(self):
        """Yield one list of tokens for each line of the text file."""
        corpus_path = datapath('crime-and-punishment.txt')
        with utils.open(corpus_path, 'r', encoding='utf-8') as handle:
            # The file stays open for as long as lines are being consumed.
            yield from (list(tokenize(raw_line)) for raw_line in handle)
>>>
>>>
# Train from a custom iterable; a fresh MyIter() instance is passed to each call.
model4 = FastText(vector_size=4, window=3, min_count=1)
model4.build_vocab(sentences=MyIter())
# Example count is read back from the model after the vocabulary pass.
total_examples = model4.corpus_count
model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
from gensim.test.utils import get_tmpfile
>>>
fname = get_tmpfile("fasttext.model")
>>>
# Save the model to the temporary file, then load it back, rebinding `model`
# to the loaded instance.
model.save(fname)
model = FastText.load(fname)


# https://radimrehurek.com/gensim/models/fasttext.html
content_copyCOPY

https://medium.com/analytics-vidhya/word-vectorization-using-glove-76919685ee0b