There are many resources dealing with how to create word embeddings, but it is in general quite an undertaking in terms of resources. You need serious GPU power and datasets to make it happen (in a timely fashion at least). Google has pretrained models on huge collections of newsgroup messages, but it is not as easy to use as the GloVe approach. GloVe’s pretrained models are based on Wikipedia, common crawls or Twitter and are more manageable than Google’s. If you are interested in simple experiments with embeddings, it’s a great stepping stone.

Below you can find such a simple experiment. Given a word and a similarity threshold, it returns words in the vicinity of the given one.

Note that this is a different thing than synonyms or the approach taken by Microsoft with concept graphs. See the article Microsoft Concept Graph in Neo4j, for example. Ultimately someone will come up with a unified approach.

import numpy as np
import os

# Kept from the original for backward compatibility; the divide-by-zero from
# the all-zero '<unk>' row is now handled explicitly below, so this is a
# belt-and-braces guard rather than a necessity.
np.seterr(divide='ignore', invalid='ignore')


class Glove(object):
    """Load pretrained GloVe word vectors and look up cosine-similar words.

    Reads the 50-dimensional Wikipedia GloVe file shipped under
    ``data/Wikipedia`` relative to this module, L2-normalizes every word
    vector, and exposes similarity queries via :meth:`find_similar_words`.
    """

    def __init__(self, file="small"):
        # NOTE: `file` is currently unused; kept so existing callers that
        # pass it keep working. TODO: wire it to alternative vector files.
        vectors_file = os.path.dirname(os.path.abspath(__file__)) + "/data/Wikipedia/glove.6B.50d.txt"

        # GloVe distributions are UTF-8; rely on an explicit encoding rather
        # than the platform locale default.
        vectors = {}
        vocab = {}   # word -> row index in W
        with open(vectors_file, 'r', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                vals = line.rstrip().split(' ')
                vocab[vals[0]] = idx
                vectors[vals[0]] = [float(x) for x in vals[1:]]

        ivocab = {i: w for w, i in vocab.items()}  # row index -> word
        vocab_size = len(vocab)
        vector_dim = len(vectors[ivocab[0]])

        W = np.zeros((vocab_size, vector_dim))
        for word, v in vectors.items():
            if word == '<unk>':
                # Leave the unknown-word placeholder as a zero row.
                continue
            W[vocab[word], :] = v

        # Normalize each word vector to unit length so a plain dot product
        # against W yields cosine similarity.
        d = np.sqrt(np.sum(W ** 2, axis=1))
        # The skipped '<unk>' row has norm 0; substitute 1 to avoid a 0/0
        # NaN row (its cosines become 0, so it never passes any threshold).
        d[d == 0] = 1.0
        self.W = (W.T / d).T

        # word gives index
        self.vocab = vocab
        # index gives word
        self.ivocab = ivocab

    def find_similar_words(self, term, thresh):
        """Return ``{word: cosine}`` for words with similarity > *thresh*.

        Returns an empty dict for out-of-vocabulary terms (the original
        returned a list here, which was inconsistent with the dict returned
        on the success path).
        """
        if term not in self.vocab:
            return {}
        vec_term = self.W[self.vocab[term], :]
        # Rows of W are unit vectors, so this dot product is the cosine.
        cosines = np.dot(self.W, vec_term)
        return {self.ivocab[i]: c for i, c in enumerate(cosines) if c > thresh}


from unittest import TestCase

import Glove as glove


class Data(TestCase):
    def test_createsClassificationData(self):
        """Smoke test: 'genius' should have close neighbors above 0.75."""
        g = glove.Glove()
        found = g.find_similar_words("genius", 0.75)
        print(found)
        # Testing started at 5:41 PM ...
        # {'imagination': 0.77246037977378146}