SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
circles.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include "stdafx.h"
4 
6 public:
11  TGraphAttributes(PUNGraph G, const char* nodeFeaturePath, const char* groundtruthPath);
13  }
14 
17 
22 
23  TVec<TIntSet> GroundTruth; // Groundtruth communities
24 };
25 
27 
28 class TCluster {
29 public:
36  GraphAttributes(GraphAttributes), K(K), Lambda(Lambda) {
37  Theta = new TFlt[K * GraphAttributes->NFeatures];
38  Derivative = new TFlt[K * GraphAttributes->NFeatures];
39  for (int k = 0; k < K; k++) {
40  for (int f = 0; f < GraphAttributes->NFeatures; f++) {
41  Theta[k * GraphAttributes->NFeatures + f] = 0;
42  Derivative[k * GraphAttributes->NFeatures + f] = 0;
43  }
44 
45  // Clusters are initially empty.
46  CHat.Add(TIntSet());
47  }
48  }
50  delete[] Theta;
51  delete[] Derivative;
52  }
53 
59  void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps);
60 
65  return CHat;
66  }
68 
69 private:
70  TFlt* Theta; // Community parameters
71  TFlt* Derivative; // Partial derivatives
72 
73  TVec<TIntSet> CHat; // Latent community memberships
74  PGraphAttributes GraphAttributes; // Graph with attributes
75 
78 
83 
89  TIntSet MCMC(TInt k, TInt MCMCReps);
90  void Gradient();
91 };
92 
94 
96 {
97  zeroOne = 0,
99  fScore = 2
100 };
101 
103 TFlt Loss(TIntSet& l, TIntSet lHat, int N, int Which)
104 {
105  if (l.Len() == 0) {
106  if (lHat.Len() == 0) {
107  return 0;
108  }
109  return 1.0;
110  }
111  if (lHat.Len() == 0) {
112  if (l.Len() == 0) {
113  return 0;
114  }
115  return 1.0;
116  }
117  TInt TruePositives = 0;
118  TInt FalsePositives = 0;
119  TInt FalseNegatives = 0;
120 
121  TFlt LabelLoss = 0;
122  for (THashSetKeyI<TInt> it = l.BegI(); it != l.EndI(); it ++) {
123  int c = it.GetKey();
124  if (!lHat.IsKey(c)) {
125  // false negative
126  FalseNegatives ++;
127  if (Which == zeroOne) {
128  LabelLoss += 1.0/N;
129  }
130  else if (Which == balancedError) {
131  LabelLoss += 0.5/l.Len();
132  }
133  }
134  }
135 
136  for (THashSetKeyI<TInt> it = lHat.BegI(); it != lHat.EndI(); it ++) {
137  int c = it.GetKey();
138  if (!l.IsKey(c)) {
139  // false positive
140  FalsePositives ++;
141  if (Which == zeroOne) {
142  LabelLoss += 1.0/N;
143  }
144  else if (Which == balancedError) {
145  LabelLoss += 0.5/(N - l.Len());
146  }
147  }
148  else {
149  TruePositives ++;
150  }
151  }
152 
153  if ((lHat.Len() == 0 || TruePositives == 0) && Which == fScore) {
154  return 1.0;
155  }
156  TFlt precision = (1.0*TruePositives)/lHat.Len();
157  TFlt recall = (1.0*TruePositives)/l.Len();
158  if (Which == fScore) {
159  return 1 - 2 * (precision*recall) / (precision + recall);
160  }
161 
162  return LabelLoss;
163 }
164 
165 /*
167 // Requires code for the munkres algorithm, available at https://github.com/saebyn/munkres-cpp
168 TFlt TotalLoss(TVec<TIntSet>& Clusters, TVec<TIntSet>& CHat, int N, int Which)
169 {
170  Matrix<double> matrix(Clusters.Len(), CHat.Len());
171 
172  for (int i = 0; i < (int) Clusters.Len(); i ++) {
173  for (int j = 0; j < (int) CHat.Len(); j ++) {
174  matrix(i,j) = Loss(Clusters[i], CHat[j], N, Which);
175  }
176  }
177 
178  Munkres m;
179  m.solve(matrix);
180 
181  TFlt l = 0;
182  for (int i = 0; i < (int) Clusters.Len(); i ++) {
183  for (int j = 0; j < (int) CHat.Len(); j ++) {
184  if (matrix(i,j) == 0) {
185  l += Loss(Clusters[i], CHat[j], N, Which);
186  }
187  }
188  }
189 
190  return l / (Clusters.Len() < CHat.Len() ? Clusters.Len() : CHat.Len());
191 }
192 */
193 
194 TGraphAttributes::TGraphAttributes(PUNGraph G, const char* NodeFeaturePath,
195  const char* GroundTruthPath) :
196  G(G) {
197  // Read node Features for the graph
198  FILE* f = fopen(NodeFeaturePath, "r");
199  int NNodes;
200  int nF;
201  fscanf(f, "%d %d", &NNodes, &nF);
202  NFeatures = nF;
203 
204  for (int i = 0; i < NNodes; i++) {
205  int nid;
206  fscanf(f, "%d", &nid);
207 
208  if (!G->IsNode(nid)) {
209  printf("Warning: %d is not a node in G.\n", nid);
210  }
211  TInt kv = NodeFeatures.AddKey(nid);
212  for (int x = 0; x < nF; x++) {
213  int z = 0;
214  fscanf(f, "%d", &z);
215  if (z) {
216  NodeFeatures[kv].AddDat(x) = z;
217  }
218  }
219  if (G->IsNode(nid)) {
220  NodeIDs.Add(nid);
221  }
222  }
223  fclose(f);
224 
225  f = fopen(GroundTruthPath, "r");
226  if (f == NULL) {
227  printf("Groundtruth file %s not found.\n", GroundTruthPath);
228  }
229  else {
230  char* CircleName = new char [1000];
231  while (fscanf(f, "%s", CircleName) == 1)
232  {
233  TIntSet Circle;
234  while (true) {
235  int nid;
236  fscanf(f, "%d", &nid);
237  Circle.AddKey(nid);
238  char c;
239  while (true) {
240  c = fgetc(f);
241  if (c == '\n') break;
242  if (c >= '0' && c <= '9') {
243  fseek(f, -1, SEEK_CUR);
244  break;
245  }
246  }
247  if (c == '\n') break;
248  }
249  GroundTruth.Add(Circle);
250  }
251  delete [] CircleName;
252  }
253  fclose(f);
254 
255  // Construct the Features encoding the difference between node attributes.
256  for (int i = 0; i < NodeIDs.Len(); i++) {
257  TInt ni = NodeIDs[i];
258  for (int j = i + 1; j < NodeIDs.Len(); j++) {
259  TInt nj = NodeIDs[j];
260  TInt kv = EdgeFeatures.AddKey(TIntPr(ni, nj));
261  for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(ni).BegI();
262  !it.IsEnd(); it++) {
263  TInt k = it.GetKey();
264  TInt diff = 0;
265  if (NodeFeatures.GetDat(nj).IsKey(k)) {
266  diff = abs(it.GetDat() - NodeFeatures.GetDat(nj).GetDat(k));
267  } else {
268  diff = abs(it.GetDat());
269  }
270  if (diff) {
271  EdgeFeatures[kv].AddDat(k) = diff;
272  }
273  }
274  for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(nj).BegI();
275  !it.IsEnd(); it++) {
276  TInt k = it.GetKey();
277  TInt diff = 0;
278  if (NodeFeatures.GetDat(ni).IsKey(k)) {
279  diff = abs(it.GetDat() - NodeFeatures.GetDat(ni).GetDat(k));
280  } else {
281  diff = abs(it.GetDat());
282  }
283  if (diff) {
284  EdgeFeatures[kv].AddDat(k) = diff;
285  }
286  }
287  }
288  }
289 }
290 
292 void TCluster::Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps) {
293  // Learning rate
294  TFlt Increment = 1.0 / (1.0 * GraphAttributes->NodeIDs.Len() * GraphAttributes->NodeIDs.Len());
295  TRnd t;
296 
297  for (int OuterRep = 0; OuterRep < OuterReps; OuterRep++) {
298  // If it's the first iteration or the solution is degenerate,
299  // randomly initialize the weights and Clusters
300  for (int k = 0; k < K; k++) {
301  if (OuterRep == 0 || CHat[k].Empty() || CHat[k].Len()
302  == GraphAttributes->NodeIDs.Len()) {
303  CHat[k].Clr();
304  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
305  if (t.GetUniDevInt(2) == 0) {
306  CHat[k].AddKey(GraphAttributes->NodeIDs[i]);
307  }
308  }
309  for (int i = 0; i < GraphAttributes->NFeatures; i++) {
310  Theta[k * GraphAttributes->NFeatures + i] = 0;
311  }
312  // Just set a single Feature to 1 as a random initialization.
314  Theta[k * GraphAttributes->NFeatures] = 1;
315  }
316  }
317 
318  for (int k = 0; k < K; k++) {
319  CHat[k] = MCMC(k, MCMCReps);
320  }
321  TFlt llPrevious = LogLikelihood();
322 
323  // Perform gradient ascent
324  TFlt ll = 0;
325  for (int gradientRep = 0; gradientRep < GradientReps; gradientRep++) {
326  Gradient();
327  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
328  Theta[i] += Increment * Derivative[i];
329  }
330  printf(".");
331  fflush( stdout);
332  ll = LogLikelihood();
333 
334  // If we reduced the objective, undo the update and stop.
335  if (ll < llPrevious) {
336  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
337  Theta[i] -= Increment * Derivative[i];
338  }
339  ll = llPrevious;
340  break;
341  }
342  llPrevious = ll;
343  }
344  printf("\nIteration %d, ll = %f\n", OuterRep + 1, (double) ll);
345  }
346 }
347 
349 TFlt Inner(TIntIntH& Feature, TFlt* Parameter) {
350  TFlt res = 0;
351  for (THashKeyDatI<TInt, TInt> it = Feature.BegI(); !it.IsEnd(); it++) {
352  res += it.GetDat() * Parameter[it.GetKey()];
353  }
354  return res;
355 }
356 
359  TRnd t;
360 
361  THash<TInt, TFlt> CostNotIncludeHash;
362  THash<TInt, TFlt> CostIncludeHash;
363 
364  TVec<TInt> NewLabel;
365  int csize = 0;
366  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
367  if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
368  NewLabel.Add(0);
369  } else {
370  NewLabel.Add(1);
371  }
372  if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
373  csize++;
374  }
375  }
376  // Compute edge log-probabilities.
378  !it.IsEnd(); it++) {
379  TIntPr e = it.GetKey();
380  TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(e);
381  TInt Src = e.Val1;
382  TInt Dst = e.Val2;
383 
384  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
385  TFlt InnerProduct = Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
386  TFlt Other = 0;
387  for (int l = 0; l < K; l++) {
388  if (l == k) {
389  continue;
390  }
391  TFlt d = (CHat[l].IsKey(Src) && CHat[l].IsKey(Dst)) ? 1 : -1;
392  Other += d * Inner(it.GetDat(), Theta + l * GraphAttributes->NFeatures);
393  }
394 
395  TFlt CostNotInclude;
396  TFlt CostInclude;
397 
398  if (Exists) {
399  CostNotInclude = -Other + InnerProduct + log(1 + exp(Other - InnerProduct));
400  CostInclude = -Other - InnerProduct + log(1 + exp(Other + InnerProduct));
401  } else {
402  CostNotInclude = log(1 + exp(Other - InnerProduct));
403  CostInclude = log(1 + exp(Other + InnerProduct));
404  }
405 
406  CostNotIncludeHash.AddDat(kv) = -CostNotInclude;
407  CostIncludeHash.AddDat(kv) = -CostInclude;
408  }
409 
410  // Run MCMC using precomputed probabilities
411  TFlt InitialTemperature = 1.0; // Initial temperature
412  for (int r = 2; r < MCMCReps + 2; r++) {
413  TFlt Temperature = InitialTemperature / log((double) r);
414  for (int n = 0; n < GraphAttributes->NodeIDs.Len(); n++) {
415  TFlt l0 = 0;
416  TFlt l1 = 0;
417  for (int np = 0; np < GraphAttributes->NodeIDs.Len(); np++) {
418  if (n == np) {
419  continue;
420  }
422  if (ed.Val1 > ed.Val2) {
423  ed = TIntPr(ed.Val2, ed.Val1);
424  }
425  TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(ed);
426  TFlt m0 = CostNotIncludeHash.GetDat(kv);
427  if (NewLabel[np] == 0) {
428  l0 += m0;
429  l1 += m0;
430  } else {
431  l0 += m0;
432  l1 += CostIncludeHash.GetDat(kv);
433  }
434  }
435  TFlt LogLikelihoodDiff = exp(l1 - l0);
436  TFlt AcceptProb = pow(LogLikelihoodDiff, 1.0 / Temperature);
437  if (t.GetUniDev() < AcceptProb) {
438  NewLabel[n] = 1;
439  } else {
440  NewLabel[n] = 0;
441  }
442  }
443  }
444 
445  TIntSet Result;
446  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
447  if (NewLabel[i]) {
448  Result.AddKey(GraphAttributes->NodeIDs[i]);
449  }
450  }
451 
452  return Result;
453 }
454 
456 void TCluster::Gradient(void) {
457  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
458  if (Theta[i] > 0) {
459  Derivative[i] = -Lambda * Theta[i];
460  } else {
461  Derivative[i] = Lambda * Theta[i];
462  }
463  }
464 
466  !it.IsEnd(); it++) {
467  TFlt InnerProduct = 0;
468  TIntPr Edge = it.GetKey();
469  TInt Src = Edge.Val1;
470  TInt Dst = Edge.Val2;
471  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
472  for (int k = 0; k < K; k++) {
473  TFlt d = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst) ? 1 : -1;
474  InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
475  }
476  TFlt expinp = exp(InnerProduct);
477  TFlt q = expinp / (1 + expinp);
478  if (q != q) {
479  q = 1; // Test for nan in case of overflow.
480  }
481 
482  for (int k = 0; k < K; k++) {
483  TBool d_ = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst);
484  TFlt d = d_ ? 1 : -1;
485  for (THashKeyDatI<TInt, TInt> itf = it.GetDat().BegI();
486  !itf.IsEnd(); itf++) {
487  TInt i = itf.GetKey();
488  TInt f = itf.GetDat();
489  if (Exists) {
490  Derivative[k * GraphAttributes->NFeatures + i] += d * f;
491  }
492  Derivative[k * GraphAttributes->NFeatures + i] += -d * f * q;
493  }
494  }
495  }
496 }
497 
500  TFlt ll = 0;
502  !it.IsEnd(); it++) {
503  TFlt InnerProduct = 0;
504  TIntPr Edge = it.GetKey();
505  TInt Src = Edge.Val1;
506  TInt Dst = Edge.Val2;
507  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
508 
509  for (int k = 0; k < K; k++) {
510  TFlt d = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst) ? 1 : -1;
511  InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
512  }
513  if (Exists) {
514  ll += InnerProduct;
515  }
516  TFlt ll_ = log(1 + exp(InnerProduct));
517  ll += -ll_;
518  }
519 
520  if (ll != ll) {
521  printf("ll isnan\n");
522  exit(1);
523  }
524  return ll;
525 }
Definition: bd.h:440
TPt< TCluster > PCluster
Definition: circles.h:93
TFlt Lambda
Definition: circles.h:77
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
lossType
Definition: circles.h:95
TIntSet MCMC(TInt k, TInt MCMCReps)
Optimize the cluster assignments for the k'th cluster.
Definition: circles.h:358
TVec< TIntSet > CHat
Definition: circles.h:73
Definition: dt.h:11
#define SEEK_CUR
Definition: fl.cpp:968
TIter BegI() const
Definition: shash.h:1105
TIter BegI() const
Definition: hash.h:213
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TCluster(PGraphAttributes GraphAttributes, TInt K, TFlt Lambda)
Definition: circles.h:35
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
TVec< TIntSet > GetCircles(void)
Definition: circles.h:64
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
TPt< TGraphAttributes > PGraphAttributes
Definition: circles.h:26
Definition: dt.h:1383
Definition: circles.h:99
void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps)
Train the model to predict K Clusters.
Definition: circles.h:292
THash< TInt, TIntIntH > NodeFeatures
Definition: circles.h:18
const TDat & GetDat() const
Definition: hash.h:81
bool IsEnd() const
Tests whether the iterator is pointing to the past-end element.
Definition: hash.h:78
THash< TIntPr, TIntIntH > EdgeFeatures
Definition: circles.h:19
TGraphAttributes(PUNGraph G, const char *nodeFeaturePath, const char *groundtruthPath)
Definition: circles.h:194
int AddKey(const TKey &Key)
Definition: shash.h:1254
TVec< TIntSet > GroundTruth
Definition: circles.h:23
Definition: dt.h:1134
TInt NFeatures
Definition: circles.h:16
~TCluster()
Definition: circles.h:49
TFlt * Derivative
Definition: circles.h:71
PUNGraph G
Definition: circles.h:15
TIter EndI() const
Definition: shash.h:1112
int Len() const
Definition: shash.h:1121
Definition: ds.h:32
TFlt Inner(TIntIntH &Feature, TFlt *Parameter)
Inner product for sparse features.
Definition: circles.h:349
Definition: hash.h:97
THashSet< TInt > TIntSet
Definition: shash.h:1382
double GetUniDev()
Definition: dt.h:30
TVal1 Val1
Definition: ds.h:34
TVal2 Val2
Definition: ds.h:35
void Gradient()
Update partial derivatives of log-likelihood.
Definition: circles.h:456
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
TInt K
Definition: circles.h:76
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
TCRef CRef
Definition: circles.h:67
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
TFlt Loss(TIntSet &l, TIntSet lHat, int N, int Which)
Compute the loss between a GroundTruth cluster l and a predicted cluster lHat.
Definition: circles.h:103
Definition: dt.h:971
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
TFlt LogLikelihood()
Compute the log-likelihood of Parameters and cluster assignments.
Definition: circles.h:499
TVec< TInt > NodeIDs
Definition: circles.h:20
TFlt * Theta
Definition: circles.h:70
PGraphAttributes GraphAttributes
Definition: circles.h:74