SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
word2vec.cpp
Go to the documentation of this file.
1 #include "stdafx.h"
2 #include "Snap.h"
3 #include "word2vec.h"
4 
5 //Code from https://github.com/nicholas-leonard/word2vec/blob/master/word2vec.c
6 //Customized for SNAP and node2vec
7 
8 void LearnVocab(TVVec<TInt, int64>& WalksVV, TIntV& Vocab) {
9  for( int64 i = 0; i < Vocab.Len(); i++) { Vocab[i] = 0; }
10  for( int64 i = 0; i < WalksVV.GetXDim(); i++) {
11  for( int j = 0; j < WalksVV.GetYDim(); j++) {
12  Vocab[WalksVV(i,j)]++;
13  }
14  }
15 }
16 
17 //Precompute unigram table using alias sampling method
18 void InitUnigramTable(TIntV& Vocab, TIntV& KTable, TFltV& UTable) {
19  double TrainWordsPow = 0;
20  double Pwr = 0.75;
21  TFltV ProbV(Vocab.Len());
22  for (int64 i = 0; i < Vocab.Len(); i++) {
23  ProbV[i]=TMath::Power(Vocab[i],Pwr);
24  TrainWordsPow += ProbV[i];
25  KTable[i]=0;
26  UTable[i]=0;
27  }
28  for (int64 i = 0; i < ProbV.Len(); i++) {
29  ProbV[i] /= TrainWordsPow;
30  }
31  TIntV UnderV;
32  TIntV OverV;
33  for (int64 i = 0; i < ProbV.Len(); i++) {
34  UTable[i] = ProbV[i] * ProbV.Len();
35  if ( UTable[i] < 1 ) {
36  UnderV.Add(i);
37  } else {
38  OverV.Add(i);
39  }
40  }
41  while(UnderV.Len() > 0 && OverV.Len() > 0) {
42  int64 Small = UnderV.Last();
43  int64 Large = OverV.Last();
44  UnderV.DelLast();
45  OverV.DelLast();
46  KTable[Small] = Large;
47  UTable[Large] = UTable[Large] + UTable[Small] - 1;
48  if (UTable[Large] < 1) {
49  UnderV.Add(Large);
50  } else {
51  OverV.Add(Large);
52  }
53  }
54 }
55 
56 int64 RndUnigramInt(TIntV& KTable, TFltV& UTable, TRnd& Rnd) {
57  TInt X = KTable[static_cast<int64>(Rnd.GetUniDev()*KTable.Len())];
58  double Y = Rnd.GetUniDev();
59  return Y < UTable[X] ? X : KTable[X];
60 }
61 
62 //Initialize negative embeddings
63 void InitNegEmb(TIntV& Vocab, int& Dimensions, TVVec<TFlt, int64>& SynNeg) {
64  SynNeg = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
65  for (int64 i = 0; i < SynNeg.GetXDim(); i++) {
66  for (int j = 0; j < SynNeg.GetYDim(); j++) {
67  SynNeg(i,j) = 0;
68  }
69  }
70 }
71 
72 //Initialize positive embeddings
73 void InitPosEmb(TIntV& Vocab, int& Dimensions, TRnd& Rnd, TVVec<TFlt, int64>& SynPos) {
74  SynPos = TVVec<TFlt, int64>(Vocab.Len(),Dimensions);
75  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
76  for (int j = 0; j < SynPos.GetYDim(); j++) {
77  SynPos(i,j) =(Rnd.GetUniDev()-0.5)/Dimensions;
78  }
79  }
80 }
81 
//Runs one skip-gram pass with negative sampling over the single walk at row
//CurrWalk of WalksVV, updating the embedding matrices SynPos (input vectors)
//and SynNeg (output vectors) in place.
//WordCntAll and Alpha are passed by reference and shared across calls: the
//word count drives the linear learning-rate decay.
//NOTE(review): the caller invokes this from an OpenMP parallel loop, so
//WordCntAll, Alpha, SynPos and SynNeg are read/written by multiple threads
//without synchronization — presumably intentional Hogwild-style lock-free
//SGD inherited from word2vec.c; confirm.
void TrainModel(TVVec<TInt, int64>& WalksVV, int& Dimensions, int& WinSize, int& Iter, bool& Verbose,
  TIntV& KTable, TFltV& UTable, int64& WordCntAll, TFltV& ExpTable, double& Alpha,
  int64 CurrWalk, TRnd& Rnd, TVVec<TFlt, int64>& SynNeg, TVVec<TFlt, int64>& SynPos) {
  TFltV Neu1V(Dimensions);  //zeroed below but otherwise unused here (CBOW leftover from word2vec.c)
  TFltV Neu1eV(Dimensions); //accumulates the gradient for the center-context pair
  int64 AllWords = WalksVV.GetXDim()*WalksVV.GetYDim();
  //Copy the current walk into a flat vector for cheap repeated indexing.
  TIntV WalkV(WalksVV.GetYDim());
  for (int j = 0; j < WalksVV.GetYDim(); j++) { WalkV[j] = WalksVV(CurrWalk,j); }
  for (int64 WordI=0; WordI<WalkV.Len(); WordI++) {
    //Every 10000 words: report progress and decay the learning rate linearly
    //with overall training progress, floored at 0.01% of StartAlpha.
    if ( WordCntAll%10000 == 0 ) {
      if ( Verbose ) {
        printf("\rLearning Progress: %.2lf%% ",(double)WordCntAll*100/(double)(Iter*AllWords));
        fflush(stdout);
      }
      Alpha = StartAlpha * (1 - WordCntAll / static_cast<double>(Iter * AllWords + 1));
      if ( Alpha < StartAlpha * 0.0001 ) { Alpha = StartAlpha * 0.0001; }
    }
    int64 Word = WalkV[WordI];
    for (int i = 0; i < Dimensions; i++) {
      Neu1V[i] = 0;
      Neu1eV[i] = 0;
    }
    //Randomly shrink the context window (word2vec trick: nearer words are
    //sampled more often). a indexes positions in the full 2*WinSize+1 window.
    int Offset = Rnd.GetUniDevInt() % WinSize;
    for (int a = Offset; a < WinSize * 2 + 1 - Offset; a++) {
      if (a == WinSize) { continue; } //skip the center word itself
      int64 CurrWordI = WordI - WinSize + a;
      if (CurrWordI < 0){ continue; }            //window clipped at walk start
      if (CurrWordI >= WalkV.Len()){ continue; } //window clipped at walk end
      int64 CurrWord = WalkV[CurrWordI];
      for (int i = 0; i < Dimensions; i++) { Neu1eV[i] = 0; }
      //negative sampling: j==0 is the true (positive) pair with label 1;
      //the remaining NegSamN draws are negatives with label 0.
      for (int j = 0; j < NegSamN+1; j++) {
        int64 Target, Label;
        if (j == 0) {
          Target = Word;
          Label = 1;
        } else {
          Target = RndUnigramInt(KTable, UTable, Rnd);
          if (Target == Word) { continue; } //never use the true word as a negative
          Label = 0;
        }
        //Dot product between the context word's input vector and the
        //target's output vector.
        double Product = 0;
        for (int i = 0; i < Dimensions; i++) {
          Product += SynPos(CurrWord,i) * SynNeg(Target,i);
        }
        double Grad; //Gradient multiplied by learning rate
        //Sigmoid saturates outside [-MaxExp, MaxExp]; inside that range it is
        //looked up from the precomputed ExpTable (Exp = e^Product), giving
        //Grad = (Label - sigmoid(Product)) * Alpha.
        if (Product > MaxExp) { Grad = (Label - 1) * Alpha; }
        else if (Product < -MaxExp) { Grad = Label * Alpha; }
        else {
          double Exp = ExpTable[static_cast<int>(Product*ExpTablePrecision)+TableSize/2];
          Grad = (Label - 1 + 1 / (1 + Exp)) * Alpha;
        }
        for (int i = 0; i < Dimensions; i++) {
          Neu1eV[i] += Grad * SynNeg(Target,i);
          SynNeg(Target,i) += Grad * SynPos(CurrWord,i);
        }
      }
      //Apply the accumulated gradient to the context word's input vector.
      for (int i = 0; i < Dimensions; i++) {
        SynPos(CurrWord,i) += Neu1eV[i];
      }
    }
    WordCntAll++;
  }
}
146 
147 
148 void LearnEmbeddings(TVVec<TInt, int64>& WalksVV, int& Dimensions, int& WinSize,
149  int& Iter, bool& Verbose, TIntFltVH& EmbeddingsHV) {
150  TIntIntH RnmH;
151  TIntIntH RnmBackH;
152  int64 NNodes = 0;
153  //renaming nodes into consecutive numbers
154  for (int i = 0; i < WalksVV.GetXDim(); i++) {
155  for (int64 j = 0; j < WalksVV.GetYDim(); j++) {
156  if ( RnmH.IsKey(WalksVV(i, j)) ) {
157  WalksVV(i, j) = RnmH.GetDat(WalksVV(i, j));
158  } else {
159  RnmH.AddDat(WalksVV(i,j),NNodes);
160  RnmBackH.AddDat(NNodes,WalksVV(i, j));
161  WalksVV(i, j) = NNodes++;
162  }
163  }
164  }
165  TIntV Vocab(NNodes);
166  LearnVocab(WalksVV, Vocab);
167  TIntV KTable(NNodes);
168  TFltV UTable(NNodes);
169  TVVec<TFlt, int64> SynNeg;
170  TVVec<TFlt, int64> SynPos;
171  TRnd Rnd(time(NULL));
172  InitPosEmb(Vocab, Dimensions, Rnd, SynPos);
173  InitNegEmb(Vocab, Dimensions, SynNeg);
174  InitUnigramTable(Vocab, KTable, UTable);
175  TFltV ExpTable(TableSize);
176  double Alpha = StartAlpha; //learning rate
177 #pragma omp parallel for schedule(dynamic)
178  for (int i = 0; i < TableSize; i++ ) {
179  double Value = -MaxExp + static_cast<double>(i) / static_cast<double>(ExpTablePrecision);
180  ExpTable[i] = TMath::Power(TMath::E, Value);
181  }
182  int64 WordCntAll = 0;
183 // op RS 2016/09/26, collapse does not compile on Mac OS X
184 //#pragma omp parallel for schedule(dynamic) collapse(2)
185  for (int j = 0; j < Iter; j++) {
186 #pragma omp parallel for schedule(dynamic)
187  for (int64 i = 0; i < WalksVV.GetXDim(); i++) {
188  TrainModel(WalksVV, Dimensions, WinSize, Iter, Verbose, KTable, UTable,
189  WordCntAll, ExpTable, Alpha, i, Rnd, SynNeg, SynPos);
190  }
191  }
192  if (Verbose) { printf("\n"); fflush(stdout); }
193  for (int64 i = 0; i < SynPos.GetXDim(); i++) {
194  TFltV CurrV(SynPos.GetYDim());
195  for (int j = 0; j < SynPos.GetYDim(); j++) { CurrV[j] = SynPos(i, j); }
196  EmbeddingsHV.AddDat(RnmBackH.GetDat(i), CurrV);
197  }
198 }
Definition: dt.h:11
void InitPosEmb(TIntV &Vocab, int &Dimensions, TRnd &Rnd, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:73
void InitUnigramTable(TIntV &Vocab, TIntV &KTable, TFltV &UTable)
Definition: word2vec.cpp:18
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
const int MaxExp
Definition: word2vec.h:9
Definition: ds.h:2222
TSizeTy GetYDim() const
Definition: ds.h:2250
static double Power(const double &Base, const double &Exponent)
Definition: xmath.h:25
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
Definition: dt.h:1134
void TrainModel(TVVec< TInt, int64 > &WalksVV, int &Dimensions, int &WinSize, int &Iter, bool &Verbose, TIntV &KTable, TFltV &UTable, int64 &WordCntAll, TFltV &ExpTable, double &Alpha, int64 CurrWalk, TRnd &Rnd, TVVec< TFlt, int64 > &SynNeg, TVVec< TFlt, int64 > &SynPos)
Definition: word2vec.cpp:82
void LearnEmbeddings(TVVec< TInt, int64 > &WalksVV, int &Dimensions, int &WinSize, int &Iter, bool &Verbose, TIntFltVH &EmbeddingsHV)
Learns embeddings using SGD, Skip-gram with negative sampling.
Definition: word2vec.cpp:148
void InitNegEmb(TIntV &Vocab, int &Dimensions, TVVec< TFlt, int64 > &SynNeg)
Definition: word2vec.cpp:63
long long int64
Definition: bd.h:27
const double StartAlpha
Definition: word2vec.h:19
TSizeTy GetXDim() const
Definition: ds.h:2249
Definition: hash.h:97
double GetUniDev()
Definition: dt.h:30
const int ExpTablePrecision
Definition: word2vec.h:12
void LearnVocab(TVVec< TInt, int64 > &WalksVV, TIntV &Vocab)
Definition: word2vec.cpp:8
const int TableSize
Definition: word2vec.h:13
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int64 RndUnigramInt(TIntV &KTable, TFltV &UTable, TRnd &Rnd)
Definition: word2vec.cpp:56
const int NegSamN
Definition: word2vec.h:16
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static double E
Definition: xmath.h:7