GloVe - Global Vectors for Word Representation

GloVe is an unsupervised learning algorithm for obtaining vector representations for words.

Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.

Playing with GloVe

import matplotlib as plt
import numpy as np
from sklearn.manifold import TSNE

Get word vectors (tranined on Wikipedia) from gensim

  • 376MB size file
import gensim.downloader as dn
gensim_wiki_model = dn.load('glove-wiki-gigaword-300')
## 300 is dimensions

Let us find the vector representation of a given word

def get_vec_rep(word : str) :
  return gensim_wiki_model[word]

def get_most_similar(word : str) :
  return gensim_wiki_model.most_similar(word)

print(get_vec_rep('green'))

[ 9.7111e-02 -3.9549e-01  5.0061e-01 -2.6536e-01  8.1473e-02 -5.1845e-01
  2.4072e-01  3.1200e-01  2.8080e-02 -6.8087e-01  3.8081e-01 -2.0683e-01
 -2.0663e-01  4.7282e-01  3.9394e-01  2.7941e-01 -7.5484e-01  1.4609e-01
 -4.7726e-01  4.5302e-01 -2.0524e-01  1.6755e-01 -1.8848e-01  2.1746e-01
  9.6432e-02 -6.8901e-01 -8.8415e-02  2.9760e-01 -2.1951e-01  1.2810e-02
 -1.7955e-03 -2.5013e-03 -2.7744e-01  3.7136e-01 -9.8262e-01  6.8767e-01
  2.6734e-01 -6.3868e-01 -3.1059e-01 -5.6088e-01 -1.4389e-02  1.8422e-01
  ...
]

print(get_most_similar('king'))


[
 ('queen', 0.6336469054222107),
 ('prince', 0.619662344455719), 
 ('monarch', 0.5899620652198792), 
 ('kingdom', 0.5791267156600952), 
 ('throne', 0.5606487989425659), 
 ('ii', 0.5562329888343811), 
 ('iii', 0.5503199100494385), 
 ('crown', 0.5224862694740295), 
 ('reign', 0.521735429763794), 
 ('kings', 0.5066401362419128)
 ]

  • What is king - boy + girl?

print(gensim_wiki_model.most_similar(positive=['king', 'girl'], negative=['boy'], topn=2))
 

[('queen', 0.6850624680519104), 
 ('monarch', 0.5474708676338196)
]


Plotting


## Plotting

vocab = ['apple', 'mango', 'sitar', 'violin', 'piano', 'pear', 'jackfruit', 'drums','beach', 'mountain', 'cloud' , 'laptop', 'minicomputer']
# TSNE T-distributed Stochastic Neighbor Embedding.
"""

t-SNE [1] is a tool to visualize high-dimensional data.

Refer: https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html


"""

def tsne_plot(model):
    labels = []
    wordvecs = []

    for word in vocab:
        wordvecs.append(model[word]) # add the wordvecs for the 'word'
        labels.append(word)
    
    tsne_model = TSNE(perplexity=3, n_components=2, init='pca', random_state=42)
    # n_components  Dimension of the embedded space.
    # random_state : determines the random number generator

    coordinates = tsne_model.fit_transform(wordvecs)

    # prepare 2-d data (x,y)
    x = []
    y = []
    for value in coordinates:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(12,12)) # 12 inches x 12 inches
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(2, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.grid()
    plt.show()

tsne_plot(gensim_wiki_model)


tsne

References