SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
circles.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include "stdafx.h"
4 
6 public:
11  TGraphAttributes(PUNGraph G, const char* nodeFeaturePath, const char* groundtruthPath);
13  }
14 
17 
22 
23  TVec<TIntSet> GroundTruth; // Groundtruth communities
24 };
25 
27 
28 class TCluster {
29 public:
36  GraphAttributes(GraphAttributes), K(K), Lambda(Lambda) {
37  Theta = new TFlt[K * GraphAttributes->NFeatures];
38  Derivative = new TFlt[K * GraphAttributes->NFeatures];
39  for (int k = 0; k < K; k++) {
40  for (int f = 0; f < GraphAttributes->NFeatures; f++) {
41  Theta[k * GraphAttributes->NFeatures + f] = 0;
42  Derivative[k * GraphAttributes->NFeatures + f] = 0;
43  }
44 
45  // Clusters are initially empty.
46  CHat.Add(TIntSet());
47  }
48  }
50  delete[] Theta;
51  delete[] Derivative;
52  }
53 
59  void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps);
60 
65  return CHat;
66  }
68 
69 private:
70  TFlt* Theta; // Community parameters
71  TFlt* Derivative; // Partial derivatives
72 
73  TVec<TIntSet> CHat; // Latent community memberships
74  PGraphAttributes GraphAttributes; // Graph with attributes
75 
78 
83 
89  TIntSet MCMC(TInt k, TInt MCMCReps);
90  void Gradient();
91 };
92 
94 
96 {
97  zeroOne = 0,
99  fScore = 2
100 };
101 
103 TFlt Loss(TIntSet& l, TIntSet lHat, int N, int Which)
104 {
105  if (l.Len() == 0) {
106  if (lHat.Len() == 0) {
107  return 0;
108  }
109  return 1.0;
110  }
111  if (lHat.Len() == 0) {
112  if (l.Len() == 0) {
113  return 0;
114  }
115  return 1.0;
116  }
117  TInt TruePositives = 0;
118  TInt FalsePositives = 0;
119  TInt FalseNegatives = 0;
120 
121  TFlt LabelLoss = 0;
122  for (THashSetKeyI<TInt> it = l.BegI(); it != l.EndI(); it ++) {
123  int c = it.GetKey();
124  if (!lHat.IsKey(c)) {
125  // false negative
126  FalseNegatives ++;
127  if (Which == zeroOne) {
128  LabelLoss += 1.0/N;
129  }
130  else if (Which == balancedError) {
131  LabelLoss += 0.5/l.Len();
132  }
133  }
134  }
135 
136  for (THashSetKeyI<TInt> it = lHat.BegI(); it != lHat.EndI(); it ++) {
137  int c = it.GetKey();
138  if (!l.IsKey(c)) {
139  // false positive
140  FalsePositives ++;
141  if (Which == zeroOne) {
142  LabelLoss += 1.0/N;
143  }
144  else if (Which == balancedError) {
145  LabelLoss += 0.5/(N - l.Len());
146  }
147  }
148  else {
149  TruePositives ++;
150  }
151  }
152 
153  if ((lHat.Len() == 0 || TruePositives == 0) && Which == fScore) {
154  return 1.0;
155  }
156  TFlt precision = (1.0*TruePositives)/lHat.Len();
157  TFlt recall = (1.0*TruePositives)/l.Len();
158  if (Which == fScore) {
159  return 1 - 2 * (precision*recall) / (precision + recall);
160  }
161 
162  return LabelLoss;
163 }
164 
165 /*
167 // Requires code for the munkres algorithm, available at https://github.com/saebyn/munkres-cpp
168 TFlt TotalLoss(TVec<TIntSet>& Clusters, TVec<TIntSet>& CHat, int N, int Which)
169 {
170  Matrix<double> matrix(Clusters.Len(), CHat.Len());
171 
172  for (int i = 0; i < (int) Clusters.Len(); i ++) {
173  for (int j = 0; j < (int) CHat.Len(); j ++) {
174  matrix(i,j) = Loss(Clusters[i], CHat[j], N, Which);
175  }
176  }
177 
178  Munkres m;
179  m.solve(matrix);
180 
181  TFlt l = 0;
182  for (int i = 0; i < (int) Clusters.Len(); i ++) {
183  for (int j = 0; j < (int) CHat.Len(); j ++) {
184  if (matrix(i,j) == 0) {
185  l += Loss(Clusters[i], CHat[j], N, Which);
186  }
187  }
188  }
189 
190  return l / (Clusters.Len() < CHat.Len() ? Clusters.Len() : CHat.Len());
191 }
192 */
193 
194 TGraphAttributes::TGraphAttributes(PUNGraph G, const char* NodeFeaturePath,
195  const char* GroundTruthPath) :
196  G(G) {
197  // Read node Features for the graph
198  FILE* f = fopen(NodeFeaturePath, "r");
199  int NNodes;
200  int nF;
201  fscanf(f, "%d %d", &NNodes, &nF);
202  NFeatures = nF;
203 
204  for (int i = 0; i < NNodes; i++) {
205  int nid;
206  fscanf(f, "%d", &nid);
207 
208  if (!G->IsNode(nid)) {
209  printf("Warning: %d is not a node in G.\n", nid);
210  }
211  TInt kv = NodeFeatures.AddKey(nid);
212  for (int x = 0; x < nF; x++) {
213  int z = 0;
214  fscanf(f, "%d", &z);
215  if (z) {
216  NodeFeatures[kv].AddDat(x) = z;
217  }
218  }
219  if (G->IsNode(nid)) {
220  NodeIDs.Add(nid);
221  }
222  }
223  fclose(f);
224 
225  f = fopen(GroundTruthPath, "r");
226  if (f == NULL) {
227  printf("Groundtruth file %s not found.\n", GroundTruthPath);
228  }
229  else {
230  char* CircleName = new char [1000];
231  while (fscanf(f, "%s", CircleName) == 1)
232  {
233  TIntSet Circle;
234  while (true) {
235  int nid;
236  fscanf(f, "%d", &nid);
237  Circle.AddKey(nid);
238  char c;
239  while (true) {
240  c = fgetc(f);
241  if (c == '\n') break;
242  if (c >= '0' && c <= '9') {
243  fseek(f, -1, SEEK_CUR);
244  break;
245  }
246  }
247  if (c == '\n') break;
248  }
249  GroundTruth.Add(Circle);
250  }
251  delete [] CircleName;
252  }
253  fclose(f);
254 
255  // Construct the Features encoding the difference between node attributes.
256  for (int i = 0; i < NodeIDs.Len(); i++) {
257  TInt ni = NodeIDs[i];
258  for (int j = i + 1; j < NodeIDs.Len(); j++) {
259  TInt nj = NodeIDs[j];
260  TInt kv = EdgeFeatures.AddKey(TIntPr(ni, nj));
261  for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(ni).BegI();
262  !it.IsEnd(); it++) {
263  TInt k = it.GetKey();
264  TInt diff = 0;
265  if (NodeFeatures.GetDat(nj).IsKey(k)) {
266  diff = abs(it.GetDat() - NodeFeatures.GetDat(nj).GetDat(k));
267  } else {
268  diff = abs(it.GetDat());
269  }
270  if (diff) {
271  EdgeFeatures[kv].AddDat(k) = diff;
272  }
273  }
274  for (THashKeyDatI<TInt, TInt> it = NodeFeatures.GetDat(nj).BegI();
275  !it.IsEnd(); it++) {
276  TInt k = it.GetKey();
277  TInt diff = 0;
278  if (NodeFeatures.GetDat(ni).IsKey(k)) {
279  diff = abs(it.GetDat() - NodeFeatures.GetDat(ni).GetDat(k));
280  } else {
281  diff = abs(it.GetDat());
282  }
283  if (diff) {
284  EdgeFeatures[kv].AddDat(k) = diff;
285  }
286  }
287  }
288  }
289 }
290 
292 void TCluster::Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps) {
293  // Learning rate
294  TFlt Increment = 1.0 / (1.0 * GraphAttributes->NodeIDs.Len() * GraphAttributes->NodeIDs.Len());
295  TRnd t;
296 
297  for (int OuterRep = 0; OuterRep < OuterReps; OuterRep++) {
298  // If it's the first iteration or the solution is degenerate,
299  // randomly initialize the weights and Clusters
300  for (int k = 0; k < K; k++) {
301  if (OuterRep == 0 || CHat[k].Empty() || CHat[k].Len()
302  == GraphAttributes->NodeIDs.Len()) {
303  CHat[k].Clr();
304  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
305  if (t.GetUniDevInt(2) == 0) {
306  CHat[k].AddKey(GraphAttributes->NodeIDs[i]);
307  }
308  }
309  for (int i = 0; i < GraphAttributes->NFeatures; i++) {
310  Theta[k * GraphAttributes->NFeatures + i] = 0;
311  }
312  // Just set a single Feature to 1 as a random initialization.
314  Theta[k * GraphAttributes->NFeatures] = 1;
315  }
316  }
317 
318  for (int k = 0; k < K; k++) {
319  CHat[k] = MCMC(k, MCMCReps);
320  }
321  TFlt llPrevious = LogLikelihood();
322 
323  // Perform gradient ascent
324  TFlt ll = 0;
325  for (int gradientRep = 0; gradientRep < GradientReps; gradientRep++) {
326  Gradient();
327  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
328  Theta[i] += Increment * Derivative[i];
329  }
330  printf(".");
331  fflush( stdout);
332  ll = LogLikelihood();
333 
334  // If we reduced the objective, undo the update and stop.
335  if (ll < llPrevious) {
336  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
337  Theta[i] -= Increment * Derivative[i];
338  }
339  ll = llPrevious;
340  break;
341  }
342  llPrevious = ll;
343  }
344  printf("\nIteration %d, ll = %f\n", OuterRep + 1, (double) ll);
345  }
346 }
347 
349 TFlt Inner(TIntIntH& Feature, TFlt* Parameter) {
350  TFlt res = 0;
351  for (THashKeyDatI<TInt, TInt> it = Feature.BegI(); !it.IsEnd(); it++) {
352  res += it.GetDat() * Parameter[it.GetKey()];
353  }
354  return res;
355 }
356 
359  TRnd t;
360 
361  THash<TInt, TFlt> CostNotIncludeHash;
362  THash<TInt, TFlt> CostIncludeHash;
363 
364  TVec<TInt> NewLabel;
365  int csize = 0;
366  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
367  if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
368  NewLabel.Add(0);
369  } else {
370  NewLabel.Add(1);
371  }
372  if (CHat[k].IsKey(GraphAttributes->NodeIDs[i])) {
373  csize++;
374  }
375  }
376  // Compute edge log-probabilities.
378  !it.IsEnd(); it++) {
379  TIntPr e = it.GetKey();
380  TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(e);
381  TInt Src = e.Val1;
382  TInt Dst = e.Val2;
383 
384  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
385  TFlt InnerProduct = Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
386  TFlt Other = 0;
387  for (int l = 0; l < K; l++) {
388  if (l == k) {
389  continue;
390  }
391  TFlt d = (CHat[l].IsKey(Src) && CHat[l].IsKey(Dst)) ? 1 : -1;
392  Other += d * Inner(it.GetDat(), Theta + l * GraphAttributes->NFeatures);
393  }
394 
395  TFlt CostNotInclude;
396  TFlt CostInclude;
397 
398  if (Exists) {
399  CostNotInclude = -Other + InnerProduct + log(1 + exp(Other - InnerProduct));
400  CostInclude = -Other - InnerProduct + log(1 + exp(Other + InnerProduct));
401  } else {
402  CostNotInclude = log(1 + exp(Other - InnerProduct));
403  CostInclude = log(1 + exp(Other + InnerProduct));
404  }
405 
406  CostNotIncludeHash.AddDat(kv) = -CostNotInclude;
407  CostIncludeHash.AddDat(kv) = -CostInclude;
408  }
409 
410  // Run MCMC using precomputed probabilities
411  TFlt InitialTemperature = 1.0; // Initial temperature
412  for (int r = 2; r < MCMCReps + 2; r++) {
413  TFlt Temperature = InitialTemperature / log((double) r);
414  for (int n = 0; n < GraphAttributes->NodeIDs.Len(); n++) {
415  TFlt l0 = 0;
416  TFlt l1 = 0;
417  for (int np = 0; np < GraphAttributes->NodeIDs.Len(); np++) {
418  if (n == np) {
419  continue;
420  }
422  if (ed.Val1 > ed.Val2) {
423  ed = TIntPr(ed.Val2, ed.Val1);
424  }
425  TInt kv = GraphAttributes->EdgeFeatures.GetKeyId(ed);
426  TFlt m0 = CostNotIncludeHash.GetDat(kv);
427  if (NewLabel[np] == 0) {
428  l0 += m0;
429  l1 += m0;
430  } else {
431  l0 += m0;
432  l1 += CostIncludeHash.GetDat(kv);
433  }
434  }
435  TFlt LogLikelihoodDiff = exp(l1 - l0);
436  TFlt AcceptProb = pow(LogLikelihoodDiff, 1.0 / Temperature);
437  if (t.GetUniDev() < AcceptProb) {
438  NewLabel[n] = 1;
439  } else {
440  NewLabel[n] = 0;
441  }
442  }
443  }
444 
445  TIntSet Result;
446  for (int i = 0; i < GraphAttributes->NodeIDs.Len(); i++) {
447  if (NewLabel[i]) {
448  Result.AddKey(GraphAttributes->NodeIDs[i]);
449  }
450  }
451 
452  return Result;
453 }
454 
456 void TCluster::Gradient(void) {
457  for (int i = 0; i < K * GraphAttributes->NFeatures; i++) {
458  if (Theta[i] > 0) {
459  Derivative[i] = -Lambda * Theta[i];
460  } else {
461  Derivative[i] = Lambda * Theta[i];
462  }
463  }
464 
466  !it.IsEnd(); it++) {
467  TFlt InnerProduct = 0;
468  TIntPr Edge = it.GetKey();
469  TInt Src = Edge.Val1;
470  TInt Dst = Edge.Val2;
471  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
472  for (int k = 0; k < K; k++) {
473  TFlt d = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst) ? 1 : -1;
474  InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
475  }
476  TFlt expinp = exp(InnerProduct);
477  TFlt q = expinp / (1 + expinp);
478  if (q != q) {
479  q = 1; // Test for nan in case of overflow.
480  }
481 
482  for (int k = 0; k < K; k++) {
483  TBool d_ = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst);
484  TFlt d = d_ ? 1 : -1;
485  for (THashKeyDatI<TInt, TInt> itf = it.GetDat().BegI();
486  !itf.IsEnd(); itf++) {
487  TInt i = itf.GetKey();
488  TInt f = itf.GetDat();
489  if (Exists) {
490  Derivative[k * GraphAttributes->NFeatures + i] += d * f;
491  }
492  Derivative[k * GraphAttributes->NFeatures + i] += -d * f * q;
493  }
494  }
495  }
496 }
497 
500  TFlt ll = 0;
502  !it.IsEnd(); it++) {
503  TFlt InnerProduct = 0;
504  TIntPr Edge = it.GetKey();
505  TInt Src = Edge.Val1;
506  TInt Dst = Edge.Val2;
507  TBool Exists = GraphAttributes->G->IsEdge(Src, Dst);
508 
509  for (int k = 0; k < K; k++) {
510  TFlt d = CHat[k].IsKey(Src) && CHat[k].IsKey(Dst) ? 1 : -1;
511  InnerProduct += d * Inner(it.GetDat(), Theta + k * GraphAttributes->NFeatures);
512  }
513  if (Exists) {
514  ll += InnerProduct;
515  }
516  TFlt ll_ = log(1 + exp(InnerProduct));
517  ll += -ll_;
518  }
519 
520  if (ll != ll) {
521  printf("ll isnan\n");
522  exit(1);
523  }
524  return ll;
525 }
Definition: bd.h:440
TPt< TCluster > PCluster
Definition: circles.h:93
TFlt Lambda
Definition: circles.h:77
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
lossType
Definition: circles.h:95
TIntSet MCMC(TInt k, TInt MCMCReps)
Optimize the cluster assignments for the k'th cluster.
Definition: circles.h:358
TVec< TIntSet > CHat
Definition: circles.h:73
Definition: dt.h:11
#define SEEK_CUR
Definition: fl.cpp:968
TIter BegI() const
Definition: shash.h:1105
TIter BegI() const
Definition: hash.h:213
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TCluster(PGraphAttributes GraphAttributes, TInt K, TFlt Lambda)
Definition: circles.h:35
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
TVec< TIntSet > GetCircles(void)
Definition: circles.h:64
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
TPt< TGraphAttributes > PGraphAttributes
Definition: circles.h:26
Definition: dt.h:1386
Definition: circles.h:99
void Train(TInt OuterReps, TInt GradientReps, TInt MCMCReps)
Train the model to predict K Clusters.
Definition: circles.h:292
THash< TInt, TIntIntH > NodeFeatures
Definition: circles.h:18
const TDat & GetDat() const
Definition: hash.h:81
bool IsEnd() const
Tests whether the iterator is pointing to the past-end element.
Definition: hash.h:78
THash< TIntPr, TIntIntH > EdgeFeatures
Definition: circles.h:19
TGraphAttributes(PUNGraph G, const char *nodeFeaturePath, const char *groundtruthPath)
Definition: circles.h:194
int AddKey(const TKey &Key)
Definition: shash.h:1254
TVec< TIntSet > GroundTruth
Definition: circles.h:23
Definition: dt.h:1137
TInt NFeatures
Definition: circles.h:16
~TCluster()
Definition: circles.h:49
TFlt * Derivative
Definition: circles.h:71
PUNGraph G
Definition: circles.h:15
TIter EndI() const
Definition: shash.h:1112
int Len() const
Definition: shash.h:1121
Definition: ds.h:32
TFlt Inner(TIntIntH &Feature, TFlt *Parameter)
Inner product for sparse features.
Definition: circles.h:349
Definition: hash.h:97
THashSet< TInt > TIntSet
Definition: shash.h:1382
double GetUniDev()
Definition: dt.h:30
TVal1 Val1
Definition: ds.h:34
TVal2 Val2
Definition: ds.h:35
void Gradient()
Update partial derivatives of log-likelihood.
Definition: circles.h:456
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
TInt K
Definition: circles.h:76
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
TCRef CRef
Definition: circles.h:67
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
TFlt Loss(TIntSet &l, TIntSet lHat, int N, int Which)
Compute the loss between a GroundTruth cluster l and a predicted cluster lHat.
Definition: circles.h:103
Definition: dt.h:974
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
TFlt LogLikelihood()
Compute the log-likelihood of Parameters and cluster assignments.
Definition: circles.h:499
TVec< TInt > NodeIDs
Definition: circles.h:20
TFlt * Theta
Definition: circles.h:70
PGraphAttributes GraphAttributes
Definition: circles.h:74