SNAP Library 4.1, Developer Reference  2018-07-26 16:30:42
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
word2vec.cpp
Go to the documentation of this file.
1 #include "stdafx.h"
2 #include "Snap.h"
3 #include "word2vec.h"
4 
5 //Code from https://github.com/nicholas-leonard/word2vec/blob/master/word2vec.c
6 //Customized for SNAP and node2vec
7 
8 void LearnVocab(TVVec<TInt, int64>& WalksVV, TIntV& Vocab) {
9  for( int64 i = 0; i < Vocab.Len(); i++) { Vocab[i] = 0; }
10  for( int64 i = 0; i < WalksVV.GetXDim(); i++) {
11  for( int j = 0; j < WalksVV.GetYDim(); j++) {
12  Vocab[WalksVV(i,j)]++;
13  }
14  }
15 }
16 
17 //Precompute unigram table using alias sampling method
// Builds the alias-method tables for O(1) sampling of negative examples
// from the smoothed noise distribution P(w) ~ count(w)^0.75.
// On return, KTable[i] holds the "alias" slot for column i and UTable[i]
// the probability of keeping column i itself rather than jumping to its
// alias. Assumes KTable and UTable are pre-sized to Vocab.Len() by the
// caller (as LearnEmbeddings does).
void InitUnigramTable(TIntV& Vocab, TIntV& KTable, TFltV& UTable) {
  double TrainWordsPow = 0;
  double Pwr = 0.75; // standard word2vec smoothing exponent
  TFltV ProbV(Vocab.Len());
  for (int64 i = 0; i < Vocab.Len(); i++) {
    ProbV[i]=TMath::Power(Vocab[i],Pwr);
    TrainWordsPow += ProbV[i];
    KTable[i]=0;
    UTable[i]=0;
  }
  // Normalize the smoothed counts into a probability distribution.
  for (int64 i = 0; i < ProbV.Len(); i++) {
    ProbV[i] /= TrainWordsPow;
  }
  // Scale each probability by N and partition slots into under-full (<1)
  // and over-full (>=1) relative to the uniform weight.
  TIntV UnderV;
  TIntV OverV;
  for (int64 i = 0; i < ProbV.Len(); i++) {
    UTable[i] = ProbV[i] * ProbV.Len();
    if ( UTable[i] < 1 ) {
      UnderV.Add(i);
    } else {
      OverV.Add(i);
    }
  }
  // Vose pairing: each under-full slot borrows the rest of its mass from
  // an over-full slot (recorded as its alias); the donor then rejoins
  // whichever list matches its remaining mass.
  while(UnderV.Len() > 0 && OverV.Len() > 0) {
    int64 Small = UnderV.Last();
    int64 Large = OverV.Last();
    UnderV.DelLast();
    OverV.DelLast();
    KTable[Small] = Large;
    UTable[Large] = (UTable[Large] + UTable[Small]) - 1;
    if (UTable[Large] < 1) {
      UnderV.Add(Large);
    } else {
      OverV.Add(Large);
    }
  }
  // Leftover slots (possible only through floating-point rounding) keep
  // their own index with probability 1.
  while(UnderV.Len() > 0){
    int64 curr = UnderV.Last();
    UnderV.DelLast();
    UTable[curr]=1;
  }
  while(OverV.Len() > 0){
    int64 curr = OverV.Last();
    OverV.DelLast();
    UTable[curr]=1;
  }
}
65 
66 int64 RndUnigramInt(TIntV& KTable, TFltV& UTable, TRnd& Rnd) {
67  TInt X = KTable[static_cast<int64>(Rnd.GetUniDev()*KTable.Len())];
68  double Y = Rnd.GetUniDev();
69  return Y < UTable[X] ? X : KTable[X];
70 }
71 
72 //Initialize negative embeddings
73 void InitNegEmb(TIntV& Vocab, const int& Dimensions, TVVec<TFlt, int64>& SynNeg) {
74  SynNeg = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
75  for (int64 i = 0; i < SynNeg.GetXDim(); i++) {
76  for (int j = 0; j < SynNeg.GetYDim(); j++) {
77  SynNeg(i,j) = 0;
78  }
79  }
80 }
81 
82 //Initialize positive embeddings
83 void InitPosEmb(TIntV& Vocab, const int& Dimensions, TRnd& Rnd, TVVec<TFlt, int64>& SynPos) {
84  SynPos = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
85  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
86  for (int j = 0; j < SynPos.GetYDim(); j++) {
87  SynPos(i,j) =(Rnd.GetUniDev()-0.5)/Dimensions;
88  }
89  }
90 }
91 
92 void TrainModel(TVVec<TInt, int64>& WalksVV, const int& Dimensions,
93  const int& WinSize, const int& Iter, const bool& Verbose,
94  TIntV& KTable, TFltV& UTable, int64& WordCntAll, TFltV& ExpTable,
95  double& Alpha, int64 CurrWalk, TRnd& Rnd,
96  TVVec<TFlt, int64>& SynNeg, TVVec<TFlt, int64>& SynPos) {
97  TFltV Neu1V(Dimensions);
98  TFltV Neu1eV(Dimensions);
99  int64 AllWords = WalksVV.GetXDim()*WalksVV.GetYDim();
100  TIntV WalkV(WalksVV.GetYDim());
101  for (int j = 0; j < WalksVV.GetYDim(); j++) { WalkV[j] = WalksVV(CurrWalk,j); }
102  for (int64 WordI=0; WordI<WalkV.Len(); WordI++) {
103  if ( WordCntAll%10000 == 0 ) {
104  if ( Verbose ) {
105  printf("\rLearning Progress: %.2lf%% ",(double)WordCntAll*100/(double)(Iter*AllWords));
106  fflush(stdout);
107  }
108  Alpha = StartAlpha * (1 - WordCntAll / static_cast<double>(Iter * AllWords + 1));
109  if ( Alpha < StartAlpha * 0.0001 ) { Alpha = StartAlpha * 0.0001; }
110  }
111  int64 Word = WalkV[WordI];
112  for (int i = 0; i < Dimensions; i++) {
113  Neu1V[i] = 0;
114  Neu1eV[i] = 0;
115  }
116  int Offset = Rnd.GetUniDevInt() % WinSize;
117  for (int a = Offset; a < WinSize * 2 + 1 - Offset; a++) {
118  if (a == WinSize) { continue; }
119  int64 CurrWordI = WordI - WinSize + a;
120  if (CurrWordI < 0){ continue; }
121  if (CurrWordI >= WalkV.Len()){ continue; }
122  int64 CurrWord = WalkV[CurrWordI];
123  for (int i = 0; i < Dimensions; i++) { Neu1eV[i] = 0; }
124  //negative sampling
125  for (int j = 0; j < NegSamN+1; j++) {
126  int64 Target, Label;
127  if (j == 0) {
128  Target = Word;
129  Label = 1;
130  } else {
131  Target = RndUnigramInt(KTable, UTable, Rnd);
132  if (Target == Word) { continue; }
133  Label = 0;
134  }
135  double Product = 0;
136  for (int i = 0; i < Dimensions; i++) {
137  Product += SynPos(CurrWord,i) * SynNeg(Target,i);
138  }
139  double Grad; //Gradient multiplied by learning rate
140  if (Product > MaxExp) { Grad = (Label - 1) * Alpha; }
141  else if (Product < -MaxExp) { Grad = Label * Alpha; }
142  else {
143  double Exp = ExpTable[static_cast<int>(Product*ExpTablePrecision)+TableSize/2];
144  Grad = (Label - 1 + 1 / (1 + Exp)) * Alpha;
145  }
146  for (int i = 0; i < Dimensions; i++) {
147  Neu1eV[i] += Grad * SynNeg(Target,i);
148  SynNeg(Target,i) += Grad * SynPos(CurrWord,i);
149  }
150  }
151  for (int i = 0; i < Dimensions; i++) {
152  SynPos(CurrWord,i) += Neu1eV[i];
153  }
154  }
155  WordCntAll++;
156  }
157 }
158 
159 
// Learns node embeddings from the given random walks using skip-gram with
// negative sampling, trained by SGD. Results are written to EmbeddingsHV,
// keyed by the original node ids.
void LearnEmbeddings(TVVec<TInt, int64>& WalksVV, const int& Dimensions,
  const int& WinSize, const int& Iter, const bool& Verbose,
  TIntFltVH& EmbeddingsHV) {
  TIntIntH RnmH;     // original node id -> dense index
  TIntIntH RnmBackH; // dense index -> original node id
  int64 NNodes = 0;
  //renaming nodes into consecutive numbers
  for (int i = 0; i < WalksVV.GetXDim(); i++) {
    for (int64 j = 0; j < WalksVV.GetYDim(); j++) {
      if ( RnmH.IsKey(WalksVV(i, j)) ) {
        WalksVV(i, j) = RnmH.GetDat(WalksVV(i, j));
      } else {
        RnmH.AddDat(WalksVV(i,j),NNodes);
        RnmBackH.AddDat(NNodes,WalksVV(i, j));
        WalksVV(i, j) = NNodes++;
      }
    }
  }
  TIntV Vocab(NNodes);
  LearnVocab(WalksVV, Vocab); // per-node occurrence counts over all walks
  TIntV KTable(NNodes);       // alias table (filled by InitUnigramTable)
  TFltV UTable(NNodes);       // acceptance probabilities for alias sampling
  TVVec<TFlt, int64> SynNeg;
  TVVec<TFlt, int64> SynPos;
  TRnd Rnd(time(NULL));
  InitPosEmb(Vocab, Dimensions, Rnd, SynPos);
  InitNegEmb(Vocab, Dimensions, SynNeg);
  InitUnigramTable(Vocab, KTable, UTable);
  // Precompute e^x over [-MaxExp, MaxExp] at ExpTablePrecision steps so
  // TrainModel evaluates the sigmoid with a table lookup instead of exp().
  TFltV ExpTable(TableSize);
  double Alpha = StartAlpha; //learning rate
#pragma omp parallel for schedule(dynamic)
  for (int i = 0; i < TableSize; i++ ) {
    double Value = -MaxExp + static_cast<double>(i) / static_cast<double>(ExpTablePrecision);
    ExpTable[i] = TMath::Power(TMath::E, Value);
  }
  int64 WordCntAll = 0;
// op RS 2016/09/26, collapse does not compile on Mac OS X
//#pragma omp parallel for schedule(dynamic) collapse(2)
  for (int j = 0; j < Iter; j++) {
    // NOTE(review): threads update SynPos/SynNeg, Alpha, WordCntAll and
    // share Rnd without synchronization — presumably intentional
    // lock-free (Hogwild-style) SGD; confirm races are acceptable.
#pragma omp parallel for schedule(dynamic)
    for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
      TrainModel(WalksVV, Dimensions, WinSize, Iter, Verbose, KTable, UTable,
         WordCntAll, ExpTable, Alpha, i, Rnd, SynNeg, SynPos);
    }
  }
  if (Verbose) { printf("\n"); fflush(stdout); }
  // Emit the learned input (positive) vectors under the original node ids.
  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
    TFltV CurrV(SynPos.GetYDim());
    for (int j = 0; j < SynPos.GetYDim(); j++) { CurrV[j] = SynPos(i, j); }
    EmbeddingsHV.AddDat(RnmBackH.GetDat(i), CurrV);
  }
}
Definition: dt.h:11
void InitUnigramTable(TIntV &Vocab, TIntV &KTable, TFltV &UTable)
Definition: word2vec.cpp:18
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
void LearnEmbeddings(TVVec< TInt, int64 > &WalksVV, const int &Dimensions, const int &WinSize, const int &Iter, const bool &Verbose, TIntFltVH &EmbeddingsHV)
Learns embeddings using SGD, Skip-gram with negative sampling.
Definition: word2vec.cpp:160
void InitNegEmb(TIntV &Vocab, const int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
Definition: word2vec.cpp:73
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
const int MaxExp
Definition: word2vec.h:10
Definition: ds.h:2223
TSizeTy GetYDim() const
Definition: ds.h:2251
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
Definition: dt.h:1134
void TrainModel(TVVec< TInt, int64 > &WalksVV, const int &Dimensions, const int &WinSize, const int &Iter, const bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:92
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:20
TSizeTy GetXDim() const
Definition: ds.h:2250
Definition: hash.h:97
double GetUniDev()
Definition: dt.h:30
const int ExpTablePrecision
Definition: word2vec.h:13
void LearnVocab(TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
Definition: word2vec.cpp:8
const int TableSize
Definition: word2vec.h:14
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int64 RndUnigramInt(TIntV &KTable, TFltV &UTable, TRnd &Rnd)
Definition: word2vec.cpp:66
const int NegSamN
Definition: word2vec.h:17
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static double E
Definition: xmath.h:7
void InitPosEmb(TIntV &Vocab, const int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:83