SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
agmfit.cpp
Go to the documentation of this file.
1 #include "stdafx.h"
2 #include "agmfit.h"
3 #include "agm.h"
4 
6 // AGM fitting
7 
8 void TAGMFit::Save(TSOut& SOut) {
9  G->Save(SOut);
10  CIDNSetV.Save(SOut);
11  EdgeComVH.Save(SOut);
12  NIDComVH.Save(SOut);
13  ComEdgesV.Save(SOut);
14  PNoCom.Save(SOut);
15  LambdaV.Save(SOut);
16  NIDCIDPrH.Save(SOut);
17  NIDCIDPrS.Save(SOut);
18  MinLambda.Save(SOut);
19  MaxLambda.Save(SOut);
20  RegCoef.Save(SOut);
21  BaseCID.Save(SOut);
22 }
23 
24 void TAGMFit::Load(TSIn& SIn, const int& RndSeed) {
25  G = TUNGraph::Load(SIn);
26  CIDNSetV.Load(SIn);
27  EdgeComVH.Load(SIn);
28  NIDComVH.Load(SIn);
29  ComEdgesV.Load(SIn);
30  PNoCom.Load(SIn);
31  LambdaV.Load(SIn);
32  NIDCIDPrH.Load(SIn);
33  NIDCIDPrS.Load(SIn);
34  MinLambda.Load(SIn);
35  MaxLambda.Load(SIn);
36  RegCoef.Load(SIn);
37  BaseCID.Load(SIn);
38  Rnd.PutSeed(RndSeed);
39 }
40 
41 // Randomly initialize bipartite community affiliation graph.
42 void TAGMFit::RandomInitCmtyVV(const int InitComs, const double ComSzAlpha, const double MemAlpha, const int MinComSz, const int MaxComSz, const int MinMem, const int MaxMem) {
43  TVec<TIntV> InitCmtyVV(InitComs, 0);
44  TAGMUtil::GenCmtyVVFromPL(InitCmtyVV, G, G->GetNodes(), InitComs, ComSzAlpha, MemAlpha, MinComSz, MaxComSz,
45  MinMem, MaxMem, Rnd);
46  SetCmtyVV(InitCmtyVV);
47 }
48 
49 // For each (u, v) in edges, precompute C_uv (the set of communities u and v share).
51  ComEdgesV.Gen(CIDNSetV.Len());
52  EdgeComVH.Gen(G->GetEdges());
53  for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) {
54  int SrcNID = SrcNI.GetId();
55  for (int v = 0; v < SrcNI.GetDeg(); v++) {
56  int DstNID = SrcNI.GetNbrNId(v);
57  if (SrcNID >= DstNID) { continue; }
58  TIntSet JointCom;
59  IAssert(NIDComVH.IsKey(SrcNID));
60  IAssert(NIDComVH.IsKey(DstNID));
61  TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom);
62  EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom);
63  for (int k = 0; k < JointCom.Len(); k++) {
64  ComEdgesV[JointCom[k]]++;
65  }
66  }
67  }
68  IAssert(EdgeComVH.Len() == G->GetEdges());
69 }
70 
71 // Set epsilon by the default value.
73  PNoCom = 1.0 / (double) G->GetNodes() / (double) G->GetNodes();
74 }
75 
76 double TAGMFit::Likelihood(const TFltV& NewLambdaV, double& LEdges, double& LNoEdges) {
77  IAssert(CIDNSetV.Len() == NewLambdaV.Len());
78  IAssert(ComEdgesV.Len() == CIDNSetV.Len());
79  LEdges = 0.0; LNoEdges = 0.0;
80  for (int e = 0; e < EdgeComVH.Len(); e++) {
81  TIntSet& JointCom = EdgeComVH[e];
82  double LambdaSum = SelectLambdaSum(NewLambdaV, JointCom);
83  double Puv = 1 - exp(- LambdaSum);
84  if (JointCom.Len() == 0) { Puv = PNoCom; }
85  IAssert(! _isnan(log(Puv)));
86  LEdges += log(Puv);
87  }
88  for (int k = 0; k < NewLambdaV.Len(); k++) {
89  int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
90  int NotEdgesInCom = MaxEk - ComEdgesV[k];
91  if(NotEdgesInCom > 0) {
92  if (LNoEdges >= TFlt::Mn + double(NotEdgesInCom) * NewLambdaV[k]) {
93  LNoEdges -= double(NotEdgesInCom) * NewLambdaV[k];
94  }
95  }
96  }
97  double LReg = 0.0;
98  if (RegCoef > 0.0) {
99  LReg = - RegCoef * TLinAlg::SumVec(NewLambdaV);
100  }
101  return LEdges + LNoEdges + LReg;
102 }
103 
105  return Likelihood(LambdaV);
106 }
107 
108 // Step size search for updating P_c (which is parametarized by lambda).
109 double TAGMFit::GetStepSizeByLineSearchForLambda(const TFltV& DeltaV, const TFltV& GradV, const double& Alpha, const double& Beta) {
110  double StepSize = 1.0;
111  double InitLikelihood = Likelihood();
112  IAssert(LambdaV.Len() == DeltaV.Len());
113  TFltV NewLambdaV(LambdaV.Len());
114  for (int iter = 0; ; iter++) {
115  for (int i = 0; i < LambdaV.Len(); i++) {
116  NewLambdaV[i] = LambdaV[i] + StepSize * DeltaV[i];
117  if (NewLambdaV[i] < MinLambda) { NewLambdaV[i] = MinLambda; }
118  if (NewLambdaV[i] > MaxLambda) { NewLambdaV[i] = MaxLambda; }
119  }
120  if (Likelihood(NewLambdaV) < InitLikelihood + Alpha * StepSize * TLinAlg::DotProduct(GradV, DeltaV)) {
121  StepSize *= Beta;
122  } else {
123  break;
124  }
125  }
126  return StepSize;
127 }
128 
129 // Gradient descent for p_c while fixing community affiliation graph (CAG).
130 int TAGMFit::MLEGradAscentGivenCAG(const double& Thres, const int& MaxIter, const TStr PlotNm) {
131  int Edges = G->GetEdges();
132  TExeTm ExeTm;
133  TFltV GradV(LambdaV.Len());
134  int iter = 0;
135  TIntFltPrV IterLV, IterGradNormV;
136  double GradCutOff = 1000;
137  for (iter = 0; iter < MaxIter; iter++) {
138  GradLogLForLambda(GradV); //if gradient is going out of the boundary, cut off
139  for (int i = 0; i < LambdaV.Len(); i++) {
140  if (GradV[i] < -GradCutOff) { GradV[i] = -GradCutOff; }
141  if (GradV[i] > GradCutOff) { GradV[i] = GradCutOff; }
142  if (LambdaV[i] <= MinLambda && GradV[i] < 0) { GradV[i] = 0.0; }
143  if (LambdaV[i] >= MaxLambda && GradV[i] > 0) { GradV[i] = 0.0; }
144  }
145  double Alpha = 0.15, Beta = 0.2;
146  if (Edges > Kilo(100)) { Alpha = 0.00015; Beta = 0.3;}
147  double LearnRate = GetStepSizeByLineSearchForLambda(GradV, GradV, Alpha, Beta);
148  if (TLinAlg::Norm(GradV) < Thres) { break; }
149  for (int i = 0; i < LambdaV.Len(); i++) {
150  double Change = LearnRate * GradV[i];
151  LambdaV[i] += Change;
152  if(LambdaV[i] < MinLambda) { LambdaV[i] = MinLambda;}
153  if(LambdaV[i] > MaxLambda) { LambdaV[i] = MaxLambda;}
154  }
155  if (! PlotNm.Empty()) {
156  double L = Likelihood();
157  IterLV.Add(TIntFltPr(iter, L));
158  IterGradNormV.Add(TIntFltPr(iter, TLinAlg::Norm(GradV)));
159  }
160  }
161  if (! PlotNm.Empty()) {
162  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
163  TGnuPlot::PlotValV(IterGradNormV, PlotNm + ".gradnorm_Q");
164  printf("MLE for Lambda completed with %d iterations(%s)\n",iter,ExeTm.GetTmStr());
165  }
166  return iter;
167 }
168 
169 void TAGMFit::RandomInit(const int& MaxK) {
170  CIDNSetV.Clr();
171  for (int c = 0; c < MaxK; c++) {
172  CIDNSetV.Add();
173  int NC = Rnd.GetUniDevInt(G -> GetNodes());
174  TUNGraph::TNodeI NI = G -> GetRndNI();
175  CIDNSetV.Last().AddKey(NI.GetId());
176  for (int v = 0; v < NC; v++) {
177  NI = G->GetNI(NI.GetNbrNId(Rnd.GetUniDevInt(NI.GetDeg())));
178  CIDNSetV.Last().AddKey(NI.GetId());
179  }
180  }
181  InitNodeData();
183 }
184 
185 // Initialize node community memberships using best neighborhood communities (see D. Gleich et al. KDD'12).
186 void TAGMFit::NeighborComInit(const int InitComs) {
187  CIDNSetV.Gen(InitComs);
188  const int Edges = G->GetEdges();
189  TFltIntPrV NIdPhiV(G->GetNodes(), 0);
190  TIntSet InvalidNIDS(G->GetNodes());
191  TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
192  TExeTm RunTm;
193  //compute conductance of neighborhood community
194  TIntV NIdV;
195  G->GetNIdV(NIdV);
196  for (int u = 0; u < NIdV.Len(); u++) {
197  TIntSet NBCmty(G->GetNI(NIdV[u]).GetDeg() + 1);
198  double Phi;
199  if (G->GetNI(NIdV[u]).GetDeg() < 5) { //do not include nodes with too few degree
200  Phi = 1.0;
201  } else {
202  TAGMUtil::GetNbhCom(G, NIdV[u], NBCmty);
203  IAssert(NBCmty.Len() == G->GetNI(NIdV[u]).GetDeg() + 1);
204  Phi = TAGMUtil::GetConductance(G, NBCmty, Edges);
205  }
206  NIdPhiV.Add(TFltIntPr(Phi, NIdV[u]));
207  }
208  NIdPhiV.Sort(true);
209  printf("conductance computation completed [%s]\n", RunTm.GetTmStr());
210  fflush(stdout);
211  //choose nodes with local minimum in conductance
212  int CurCID = 0;
213  for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
214  int UID = NIdPhiV[ui].Val2;
215  fflush(stdout);
216  if (InvalidNIDS.IsKey(UID)) { continue; }
217  ChosenNIDV.Add(UID); //FOR DEBUG
218  //add the node and its neighbors to the current community
219  CIDNSetV[CurCID].AddKey(UID);
220  TUNGraph::TNodeI NI = G->GetNI(UID);
221  fflush(stdout);
222  for (int e = 0; e < NI.GetDeg(); e++) {
223  CIDNSetV[CurCID].AddKey(NI.GetNbrNId(e));
224  }
225  //exclude its neighbors from the next considerations
226  for (int e = 0; e < NI.GetDeg(); e++) {
227  InvalidNIDS.AddKey(NI.GetNbrNId(e));
228  }
229  CurCID++;
230  fflush(stdout);
231  if (CurCID >= InitComs) { break; }
232  }
233  if (InitComs > CurCID) {
234  printf("%d communities needed to fill randomly\n", InitComs - CurCID);
235  }
236  //assign a member to zero-member community (if any)
237  for (int c = 0; c < CIDNSetV.Len(); c++) {
238  if (CIDNSetV[c].Len() == 0) {
239  int ComSz = 10;
240  for (int u = 0; u < ComSz; u++) {
241  int UID = G->GetRndNI().GetId();
242  CIDNSetV[c].AddKey(UID);
243  }
244  }
245  }
246  InitNodeData();
248 }
249 
250 // Add epsilon community (base community which includes all nodes) into community affiliation graph. It means that we fit for epsilon.
252  TVec<TIntV> CmtyVV;
253  GetCmtyVV(CmtyVV);
254  TIntV TmpV = CmtyVV[0];
255  CmtyVV.Add(TmpV);
256  G->GetNIdV(CmtyVV[0]);
257  IAssert(CIDNSetV.Len() + 1 == CmtyVV.Len());
258  SetCmtyVV(CmtyVV);
259  InitNodeData();
260  BaseCID = 0;
261  PNoCom = 0.0;
262 }
263 
266  NIDComVH.Gen(G->GetNodes());
267  for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) {
268  NIDComVH.AddDat(NI.GetId());
269  }
271  GetEdgeJointCom();
272  LambdaV.Gen(CIDNSetV.Len());
273  for (int c = 0; c < CIDNSetV.Len(); c++) {
274  int MaxE = (CIDNSetV[c].Len()) * (CIDNSetV[c].Len() - 1) / 2;
275  if (MaxE < 2) {
276  LambdaV[c] = MaxLambda;
277  }
278  else{
279  LambdaV[c] = -log((double) (MaxE - ComEdgesV[c]) / MaxE);
280  }
281  if (LambdaV[c] > MaxLambda) { LambdaV[c] = MaxLambda; }
282  if (LambdaV[c] < MinLambda) { LambdaV[c] = MinLambda; }
283  }
284  NIDCIDPrS.Gen(G->GetNodes() * 10);
285  for (int c = 0; c < CIDNSetV.Len(); c++) {
286  for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) {
287  NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c));
288  }
289  }
290 }
291 
292 // After MCMC, NID leaves community CID.
293 void TAGMFit::LeaveCom(const int& NID, const int& CID) {
294  TUNGraph::TNodeI NI = G->GetNI(NID);
295  for (int e = 0; e < NI.GetDeg(); e++) {
296  int VID = NI.GetNbrNId(e);
297  if (NIDComVH.GetDat(VID).IsKey(CID)) {
298  TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
299  EdgeComVH.GetDat(SrcDstNIDPr).DelKey(CID);
300  ComEdgesV[CID]--;
301  }
302  }
303  CIDNSetV[CID].DelKey(NID);
304  NIDComVH.GetDat(NID).DelKey(CID);
305  NIDCIDPrS.DelKey(TIntPr(NID, CID));
306 }
307 
308 // After MCMC, NID joins community CID.
309 void TAGMFit::JoinCom(const int& NID, const int& JoinCID) {
310  TUNGraph::TNodeI NI = G->GetNI(NID);
311  for (int e = 0; e < NI.GetDeg(); e++) {
312  int VID = NI.GetNbrNId(e);
313  if (NIDComVH.GetDat(VID).IsKey(JoinCID)) {
314  TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
315  EdgeComVH.GetDat(SrcDstNIDPr).AddKey(JoinCID);
316  ComEdgesV[JoinCID]++;
317  }
318  }
319  CIDNSetV[JoinCID].AddKey(NID);
320  NIDComVH.GetDat(NID).AddKey(JoinCID);
321  NIDCIDPrS.AddKey(TIntPr(NID, JoinCID));
322 }
323 
324 // Sample transition: Choose among (join, leave, switch), and then sample (NID, CID).
325 void TAGMFit::SampleTransition(int& NID, int& JoinCID, int& LeaveCID, double& DeltaL) {
326  int Option = Rnd.GetUniDevInt(3); //0:Join 1:Leave 2:Switch
327  if (NIDCIDPrS.Len() <= 1) { Option = 0; } //if there is only one node membership, only join is possible.
328  int TryCnt = 0;
329  const int MaxTryCnt = G->GetNodes();
330  DeltaL = TFlt::Mn;
331  if (Option == 0) {
332  do {
333  JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
334  NID = G->GetRndNId();
335  } while (TryCnt++ < MaxTryCnt && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
336  if (TryCnt < MaxTryCnt) { //if successfully find a move
337  DeltaL = SeekJoin(NID, JoinCID);
338  }
339  }
340  else if (Option == 1) {
341  do {
343  NID = NIDCIDPr.Val1;
344  LeaveCID = NIDCIDPr.Val2;
345  } while (TryCnt++ < MaxTryCnt && LeaveCID == BaseCID);
346  if (TryCnt < MaxTryCnt) {//if successfully find a move
347  DeltaL = SeekLeave(NID, LeaveCID);
348  }
349  }
350  else{
351  do {
353  NID = NIDCIDPr.Val1;
354  LeaveCID = NIDCIDPr.Val2;
355  } while (TryCnt++ < MaxTryCnt && (NIDComVH.GetDat(NID).Len() == CIDNSetV.Len() || LeaveCID == BaseCID));
356  do {
357  JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
358  } while (TryCnt++ < G->GetNodes() && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
359  if (TryCnt < MaxTryCnt) {//if successfully find a move
360  DeltaL = SeekSwitch(NID, LeaveCID, JoinCID);
361  }
362  }
363 }
364 
366 void TAGMFit::RunMCMC(const int& MaxIter, const int& EvalLambdaIter, const TStr& PlotFPrx) {
367  TExeTm IterTm, TotalTm;
368  double PrevL = Likelihood(), DeltaL = 0;
369  double BestL = PrevL;
370  printf("initial likelihood = %f\n",PrevL);
371  TIntFltPrV IterTrueLV, IterJoinV, IterLeaveV, IterAcceptV, IterSwitchV, IterLBV;
372  TIntPrV IterTotMemV;
373  TIntV IterV;
374  TFltV BestLV;
375  TVec<TIntSet> BestCmtySetV;
376  int SwitchCnt = 0, LeaveCnt = 0, JoinCnt = 0, AcceptCnt = 0, ProbBinSz;
377  int Nodes = G->GetNodes(), Edges = G->GetEdges();
378  TExeTm PlotTm;
379  ProbBinSz = TMath::Mx(1000, G->GetNodes() / 10); //bin to compute probabilities
380  IterLBV.Add(TIntFltPr(1, BestL));
381 
382  for (int iter = 0; iter < MaxIter; iter++) {
383  IterTm.Tick();
384  int NID = -1;
385  int JoinCID = -1, LeaveCID = -1;
386  SampleTransition(NID, JoinCID, LeaveCID, DeltaL); //sample a move
387  double OptL = PrevL;
388  if (DeltaL > 0 || Rnd.GetUniDev() < exp(DeltaL)) { //if it is accepted
389  IterTm.Tick();
390  if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCom(NID, LeaveCID); }
391  if (JoinCID > -1 && JoinCID != BaseCID) { JoinCom(NID, JoinCID); }
392  if (LeaveCID > -1 && JoinCID > -1 && JoinCID != BaseCID && LeaveCID != BaseCID) { SwitchCnt++; }
393  else if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCnt++;}
394  else if (JoinCID > -1 && JoinCID != BaseCID) { JoinCnt++;}
395  AcceptCnt++;
396  if ((iter + 1) % EvalLambdaIter == 0) {
397  IterTm.Tick();
398  MLEGradAscentGivenCAG(0.01, 3);
399  OptL = Likelihood();
400  }
401  else{
402  OptL = PrevL + DeltaL;
403  }
404  if (BestL <= OptL && CIDNSetV.Len() > 0) {
405  BestCmtySetV = CIDNSetV;
406  BestLV = LambdaV;
407  BestL = OptL;
408  }
409  }
410  if (iter > 0 && (iter % ProbBinSz == 0) && PlotFPrx.Len() > 0) {
411  IterLBV.Add(TIntFltPr(iter, OptL));
412  IterSwitchV.Add(TIntFltPr(iter, (double) SwitchCnt / (double) AcceptCnt));
413  IterLeaveV.Add(TIntFltPr(iter, (double) LeaveCnt / (double) AcceptCnt));
414  IterJoinV.Add(TIntFltPr(iter, (double) JoinCnt / (double) AcceptCnt));
415  IterAcceptV.Add(TIntFltPr(iter, (double) AcceptCnt / (double) ProbBinSz));
416  SwitchCnt = JoinCnt = LeaveCnt = AcceptCnt = 0;
417  }
418  PrevL = OptL;
419  if ((iter + 1) % 10000 == 0) {
420  printf("\r%d iterations completed [%.2f]", iter, (double) iter / (double) MaxIter);
421  }
422  }
423 
424  // plot the likelihood and acceptance probabilities if the plot file name is given
425  if (PlotFPrx.Len() > 0) {
426  TGnuPlot GP1;
427  GP1.AddPlot(IterLBV, gpwLinesPoints, "likelihood");
428  GP1.SetDataPlotFNm(PlotFPrx + ".likelihood.tab", PlotFPrx + ".likelihood.plt");
429  TStr TitleStr = TStr::Fmt(" N:%d E:%d", Nodes, Edges);
430  GP1.SetTitle(PlotFPrx + ".likelihood" + TitleStr);
431  GP1.SavePng(PlotFPrx + ".likelihood.png");
432 
433  TGnuPlot GP2;
434  GP2.AddPlot(IterSwitchV, gpwLinesPoints, "Switch");
435  GP2.AddPlot(IterLeaveV, gpwLinesPoints, "Leave");
436  GP2.AddPlot(IterJoinV, gpwLinesPoints, "Join");
437  GP2.AddPlot(IterAcceptV, gpwLinesPoints, "Accept");
438  GP2.SetTitle(PlotFPrx + ".transition");
439  GP2.SetDataPlotFNm(PlotFPrx + "transition_prob.tab", PlotFPrx + "transition_prob.plt");
440  GP2.SavePng(PlotFPrx + "transition_prob.png");
441  }
442  CIDNSetV = BestCmtySetV;
443  LambdaV = BestLV;
444 
445  InitNodeData();
446  MLEGradAscentGivenCAG(0.001, 100);
447  printf("\nMCMC completed (best likelihood: %.2f) [%s]\n", BestL, TotalTm.GetTmStr());
448 }
449 
450 // Returns \v QV, a vector of (1 - p_c) for each community c.
451 void TAGMFit::GetQV(TFltV& OutV) {
452  OutV.Gen(LambdaV.Len());
453  for (int i = 0; i < LambdaV.Len(); i++) {
454  OutV[i] = exp(- LambdaV[i]);
455  }
456 }
457 
458 // Remove empty communities.
460  int DelCnt = 0;
461  for (int c = 0; c < CIDNSetV.Len(); c++) {
462  if (CIDNSetV[c].Len() == 0) {
463  CIDNSetV[c] = CIDNSetV.Last();
464  CIDNSetV.DelLast();
465  LambdaV[c] = LambdaV.Last();
466  LambdaV.DelLast();
467  DelCnt++;
468  c--;
469  }
470  }
471  return DelCnt;
472 }
473 
474 // Compute the change in likelihood (Delta) if node UID leaves community CID.
475 double TAGMFit::SeekLeave(const int& UID, const int& CID) {
476  IAssert(CIDNSetV[CID].IsKey(UID));
477  IAssert(G->IsNode(UID));
478  double Delta = 0.0;
479  TUNGraph::TNodeI NI = G->GetNI(UID);
480  int NbhsInC = 0;
481  for (int e = 0; e < NI.GetDeg(); e++) {
482  const int VID = NI.GetNbrNId(e);
483  if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
484  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
485  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
486  double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
487  CurPuv = 1 - exp(- LambdaSum);
488  NewPuv = 1 - exp(- LambdaSum + LambdaV[CID]);
489  IAssert(JointCom.Len() > 0);
490  if (JointCom.Len() == 1) {
491  NewPuv = PNoCom;
492  }
493  Delta += (log(NewPuv) - log(CurPuv));
494  IAssert(!_isnan(Delta));
495  NbhsInC++;
496  }
497  Delta += LambdaV[CID] * (CIDNSetV[CID].Len() - 1 - NbhsInC);
498  return Delta;
499 }
500 
501 // Compute the change in likelihood (Delta) if node UID joins community CID.
502 double TAGMFit::SeekJoin(const int& UID, const int& CID) {
503  IAssert(! CIDNSetV[CID].IsKey(UID));
504  double Delta = 0.0;
505  TUNGraph::TNodeI NI = G->GetNI(UID);
506  int NbhsInC = 0;
507  for (int e = 0; e < NI.GetDeg(); e++) {
508  const int VID = NI.GetNbrNId(e);
509  if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
510  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
511  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
512  double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
513  CurPuv = 1 - exp(- LambdaSum);
514  if (JointCom.Len() == 0) { CurPuv = PNoCom; }
515  NewPuv = 1 - exp(- LambdaSum - LambdaV[CID]);
516  Delta += (log(NewPuv) - log(CurPuv));
517  IAssert(!_isnan(Delta));
518  NbhsInC++;
519  }
520  Delta -= LambdaV[CID] * (CIDNSetV[CID].Len() - NbhsInC);
521  return Delta;
522 }
523 
524 // Compute the change in likelihood (Delta) if node UID switches from CurCID to NewCID.
525 double TAGMFit::SeekSwitch(const int& UID, const int& CurCID, const int& NewCID) {
526  IAssert(! CIDNSetV[NewCID].IsKey(UID));
527  IAssert(CIDNSetV[CurCID].IsKey(UID));
528  double Delta = SeekJoin(UID, NewCID) + SeekLeave(UID, CurCID);
529  //correct only for intersection between new com and current com
530  TUNGraph::TNodeI NI = G->GetNI(UID);
531  for (int e = 0; e < NI.GetDeg(); e++) {
532  const int VID = NI.GetNbrNId(e);
533  if (! NIDComVH.GetDat(VID).IsKey(CurCID) || ! NIDComVH.GetDat(VID).IsKey(NewCID)) {continue;}
534  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
535  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
536  double CurPuv, NewPuvAfterJoin, NewPuvAfterLeave, NewPuvAfterSwitch, LambdaSum = SelectLambdaSum(JointCom);
537  CurPuv = 1 - exp(- LambdaSum);
538  NewPuvAfterLeave = 1 - exp(- LambdaSum + LambdaV[CurCID]);
539  NewPuvAfterJoin = 1 - exp(- LambdaSum - LambdaV[NewCID]);
540  NewPuvAfterSwitch = 1 - exp(- LambdaSum - LambdaV[NewCID] + LambdaV[CurCID]);
541  if (JointCom.Len() == 1 || NewPuvAfterLeave == 0.0) {
542  NewPuvAfterLeave = PNoCom;
543  }
544  Delta += (log(NewPuvAfterSwitch) + log(CurPuv) - log(NewPuvAfterLeave) - log(NewPuvAfterJoin));
545  if (_isnan(Delta)) {
546  printf("NS:%f C:%f NL:%f NJ:%f PNoCom:%f", NewPuvAfterSwitch, CurPuv, NewPuvAfterLeave, NewPuvAfterJoin, PNoCom.Val);
547  }
548  IAssert(!_isnan(Delta));
549  }
550  return Delta;
551 }
552 
553 // Get communities whose p_c is higher than 1 - QMax.
554 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, const double QMax) {
555  TFltV TmpQV;
556  GetCmtyVV(CmtyVV, TmpQV, QMax);
557 }
558 
559 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, TFltV& QV, const double QMax) {
560  CmtyVV.Gen(CIDNSetV.Len(), 0);
561  QV.Gen(CIDNSetV.Len(), 0);
562  TIntFltH CIDLambdaH(CIDNSetV.Len());
563  for (int c = 0; c < CIDNSetV.Len(); c++) {
564  CIDLambdaH.AddDat(c, LambdaV[c]);
565  }
566  CIDLambdaH.SortByDat(false);
567  for (int c = 0; c < CIDNSetV.Len(); c++) {
568  int CID = CIDLambdaH.GetKey(c);
569  IAssert(LambdaV[CID] >= MinLambda);
570  double Q = exp( - (double) LambdaV[CID]);
571  if (Q > QMax) { continue; }
572  TIntV CmtyV;
573  CIDNSetV[CID].GetKeyV(CmtyV);
574  if (CmtyV.Len() == 0) { continue; }
575  if (CID == BaseCID) { //if the community is the base community(epsilon community), discard
576  IAssert(CmtyV.Len() == G->GetNodes());
577  } else {
578  CmtyVV.Add(CmtyV);
579  QV.Add(Q);
580  }
581  }
582 }
583 
584 void TAGMFit::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
585  CIDNSetV.Gen(CmtyVV.Len());
586  for (int c = 0; c < CIDNSetV.Len(); c++) {
587  CIDNSetV[c].AddKeyV(CmtyVV[c]);
588  for (int j = 0; j < CmtyVV[c].Len(); j++) { IAssert(G->IsNode(CmtyVV[c][j])); } // check whether the member nodes exist in the graph
589  }
590  InitNodeData();
592 }
593 
594 // Gradient of likelihood for P_c.
596  GradV.Gen(LambdaV.Len());
597  TFltV SumEdgeProbsV(LambdaV.Len());
598  for (int e = 0; e < EdgeComVH.Len(); e++) {
599  TIntSet& JointCom = EdgeComVH[e];
600  double LambdaSum = SelectLambdaSum(JointCom);
601  double Puv = 1 - exp(- LambdaSum);
602  if (JointCom.Len() == 0) { Puv = PNoCom; }
603  for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) {
604  SumEdgeProbsV[SI.GetKey()] += (1 - Puv) / Puv;
605  }
606  }
607  for (int k = 0; k < LambdaV.Len(); k++) {
608  int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
609  int NotEdgesInCom = MaxEk - ComEdgesV[k];
610  GradV[k] = SumEdgeProbsV[k] - (double) NotEdgesInCom;
611  if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists
612  GradV[k] -= RegCoef;
613  }
614  }
615 }
616 
617 // Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). It is used to compute edge probability P_uv.
618 double TAGMFit::SelectLambdaSum(const TIntSet& ComK) {
619  return SelectLambdaSum(LambdaV, ComK);
620 }
621 
622 double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
623  double Result = 0.0;
624  for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) {
625  IAssert(NewLambdaV[SI.GetKey()] >= 0);
626  Result += NewLambdaV[SI.GetKey()];
627  }
628  return Result;
629 }
630 
631 // Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations.
632 double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) {
633  TIntV NIdV;
634  G->GetNIdV(NIdV);
635  uint64 PairNoCom = 0, EdgesNoCom = 0;
636  for (int u = 0; u < NIdV.Len(); u++) {
637  for (int v = u + 1; v < NIdV.Len(); v++) {
638  int SrcNID = NIdV[u], DstNID = NIdV[v];
639  TIntSet JointCom;
640  TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom);
641  if(JointCom.Len() == 0) {
642  PairNoCom++;
643  if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; }
644  if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
645  }
646  }
647  if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
648  }
649  double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes();
650  if (EdgesNoCom > 0) {
651  PNoCom = (double) EdgesNoCom / (double) PairNoCom;
652  } else {
653  PNoCom = DefaultVal;
654  }
655  printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val);
656  return PNoCom;
657 }
658 
660  TIntFltH CIDLambdaH(CIDNSetV.Len());
661  for (int c = 0; c < CIDNSetV.Len(); c++) {
662  CIDLambdaH.AddDat(c, LambdaV[c]);
663  }
664  CIDLambdaH.SortByDat(false);
665  int Coms = 0;
666  for (int i = 0; i < LambdaV.Len(); i++) {
667  int CID = CIDLambdaH.GetKey(i);
668  if (LambdaV[CID] <= 0.0001) { continue; }
669  printf("P_c : %.3f Com Sz: %d, Total Edges inside: %d \n", 1.0 - exp(- LambdaV[CID]), CIDNSetV[CID].Len(), (int) ComEdgesV[CID]);
670  Coms++;
671  }
672  printf("%d Communities, Total Memberships = %d, Likelihood = %.2f, Epsilon = %f\n", Coms, NIDCIDPrS.Len(), Likelihood(), PNoCom.Val);
673 }
#define IAssert(Cond)
Definition: bd.h:262
static const T & Mn(const T &LVal, const T &RVal)
Definition: xmath.h:36
void Load(TSIn &SIn, const int &RndSeed=0)
Definition: agmfit.cpp:24
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
static void GetNbhCom(const PUNGraph &Graph, const int NID, TIntSet &NBCmtyS)
Definition: agm.cpp:706
static double Norm(const TFltV &x)
Definition: linalg.cpp:324
TPair< TFlt, TInt > TFltIntPr
Definition: ds.h:97
int Len() const
Definition: dt.h:487
Definition: tm.h:355
TRnd Rnd
Definition: agmfit.h:17
#define Kilo(n)
Definition: gbase.h:3
double SeekSwitch(const int &UID, const int &CurCID, const int &NewCID)
Definition: agmfit.cpp:525
static double SumVec(const TFltV &x)
Definition: linalg.cpp:279
static void GetNodeMembership(THash< TInt, TIntSet > &NIDComVH, const TVec< TIntV > &CmtyVV)
get hash table of
Definition: agm.cpp:335
void SavePng(const int &SizeX=1000, const int &SizeY=800, const TStr &Comment=TStr())
Definition: gnuplot.h:120
static const T & Mx(const T &LVal, const T &RVal)
Definition: xmath.h:32
void SetDefaultPNoCom()
Set Epsilon (the probability that two nodes sharing no communities connect) to the default value...
Definition: agmfit.cpp:72
void Save(TSOut &SOut) const
Definition: dt.h:1150
THash< TIntPr, TInt > NIDCIDPrS
pairs (for sampling MCMC moves).
Definition: agmfit.h:19
TIter BegI() const
Definition: shash.h:1105
void GetNIdV(TIntV &NIdV) const
Gets a vector IDs of all nodes in the graph.
Definition: graph.cpp:153
void SetTitle(const TStr &PlotTitle)
Definition: gnuplot.h:70
double GetStepSizeByLineSearchForLambda(const TFltV &DeltaV, const TFltV &GradV, const double &Alpha, const double &Beta)
Step size search for updating P_c (which is parametarized by regularization parameter lambda)...
Definition: agmfit.cpp:109
double Likelihood()
COMMENT.
Definition: agmfit.cpp:104
double Val
Definition: dt.h:1385
int GetEdges() const
Returns the number of edges in the graph.
Definition: graph.cpp:82
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:68
TFltV LambdaV
Parametrization of P_c (edge probability in community c), P_c = 1 - exp(-lambda). ...
Definition: agmfit.h:16
void Save(TSOut &SOut) const
Definition: hash.h:183
void PrintSummary()
COMMENT.
Definition: agmfit.cpp:659
TInt BaseCID
ID of the Epsilon-community (in case we fit P_c of the epsilon community). We do not fit for the Epsi...
Definition: agmfit.h:23
static void GetIntersection(const THashSet< TInt > &A, const THashSet< TInt > &B, THashSet< TInt > &C)
Definition: agm.cpp:404
void Save(TSOut &SOut) const
Saves the graph to a (binary) stream SOut.
Definition: graph.h:170
void SetDataPlotFNm(const TStr &DatFNm, const TStr &PltFNm)
Definition: gnuplot.h:74
THash< TIntPr, TIntSet > EdgeComVH
Edge -> Shared Community ID Set.
Definition: agmfit.h:12
void Load(TSIn &SIn)
Definition: ds.h:946
static double GetConductance(const PUNGraph &Graph, const TIntSet &CmtyS, const int Edges)
Definition: agm.cpp:683
void GetEdgeJointCom()
For each (u, v) in edges, precompute C_uv (the set of communities nodes u and v share).
Definition: agmfit.cpp:50
int GetNodes() const
Returns the number of nodes in the graph.
Definition: graph.h:192
void Load(TSIn &SIn)
Definition: hash.h:177
Definition: fl.h:58
void Save(TSOut &SOut) const
Definition: ds.h:954
int GetRndNId(TRnd &Rnd=TInt::Rnd)
Returns an ID of a random node in the graph.
Definition: graph.h:285
TPair< TInt, TFlt > TIntFltPr
Definition: ds.h:87
int GetDeg() const
Returns degree of the current node.
Definition: graph.h:90
const char * GetTmStr() const
Definition: tm.h:370
void GradLogLForLambda(TFltV &GradV)
Definition: agmfit.cpp:595
void DelKey(const TKey &Key)
Definition: hash.h:404
double CalcPNoComByCmtyVV(const int &SamplePairs=-1)
Compute the empirical edge probability between a pair of nodes who share no community (epsilon)...
Definition: agmfit.cpp:632
void Save(TSOut &SOut)
Definition: agmfit.cpp:8
void LeaveCom(const int &NID, const int &CID)
After MCMC, NID leaves community CID.
Definition: agmfit.cpp:293
TFlt MinLambda
Minimum value of regularization parameter lambda (default = 1e-5).
Definition: agmfit.h:20
void GetCmtyVV(TVec< TIntV > &CmtyVV, const double QMax=2.0)
Get communities whose p_c is higher than 1 - QMax.
Definition: agmfit.cpp:554
int RemoveEmptyCom()
Remove all communities with no members.
Definition: agmfit.cpp:459
int MLEGradAscentGivenCAG(const double &Thres=0.001, const int &MaxIter=10000, const TStr PlotNm=TStr())
Gradient descent for p_c while keeping the community affiliation graph (CAG) fixed.
Definition: agmfit.cpp:130
void Gen(const int &ExpectVals)
Definition: hash.h:222
THash< TInt, TIntSet > NIDComVH
Node ID -> Communitie IDs the node belongs to.
Definition: agmfit.h:13
unsigned long long uint64
Definition: bd.h:38
double SelectLambdaSum(const TIntSet &ComK)
Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). The function is used to compute ed...
Definition: agmfit.cpp:618
static void GenCmtyVVFromPL(TVec< TIntV > &CmtyVV, const PUNGraph &Graph, const int &Nodes, const int &Coms, const double &ComSzAlpha, const double &MemAlpha, const int &MinSz, const int &MaxSz, const int &MinK, const int &MaxK, TRnd &Rnd)
Generate bipartite community affiliation from given power law coefficients for membership distributio...
Definition: agm.cpp:101
TFlt RegCoef
Regularization parameter when we fit for P_c (for finding # communities).
Definition: agmfit.h:22
TNodeI GetNI(const int &NId) const
Returns an iterator referring to the node of ID NId in the graph.
Definition: graph.h:241
void Load(TSIn &SIn)
Definition: dt.h:1149
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
void NeighborComInit(const int InitComs)
Initialize node community memberships using best neighborhood communities (see D. Gleich et al...
Definition: agmfit.cpp:186
void GetQV(TFltV &OutV)
Returns QV, a vector of (1 - p_c) for each community c.
Definition: agmfit.cpp:451
TNodeI GetRndNI(TRnd &Rnd=TInt::Rnd)
Returns an interator referring to a random node in the graph.
Definition: graph.h:287
void Tick()
Definition: tm.h:364
Definition: fl.h:128
static PUNGraph Load(TSIn &SIn)
Static constructor that loads the graph from a stream SIn and returns a pointer to it...
Definition: graph.h:178
TIter EndI() const
Definition: shash.h:1112
int Len() const
Definition: shash.h:1121
Definition: ds.h:32
void PutSeed(const int &_Seed)
Definition: dt.cpp:18
int AddKey(const TKey &Key)
Definition: hash.h:373
double SeekJoin(const int &UID, const int &CID)
Compute the change in likelihood (Delta) if node UID joins community CID.
Definition: agmfit.cpp:502
TStr GetStr() const
Definition: dt.h:1360
void RunMCMC(const int &MaxIter, const int &EvalLambdaIter, const TStr &PlotFPrx=TStr())
Main procedure for fitting the AGM to a given graph using MCMC.
Definition: agmfit.cpp:366
Definition: dt.h:412
bool Empty() const
Definition: dt.h:488
void InitNodeData()
COMMENT.
Definition: agmfit.cpp:264
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
double SeekLeave(const int &UID, const int &CID)
Compute the change in likelihood (Delta) if node UID leaves community CID.
Definition: agmfit.cpp:475
void JoinCom(const int &NID, const int &JoinCID)
Definition: agmfit.cpp:309
void RandomInitCmtyVV(const int InitComs, const double ComSzAlpha=1.3, const double MemAlpha=1.8, const int MinComSz=8, const int MaxComSz=200, const int MinMem=1, const int MaxMem=10)
Randomly initialize bipartite community affiliation graph.
Definition: agmfit.cpp:42
void Load(TSIn &SIn)
Definition: dt.h:1402
TFlt MaxLambda
Maximum value of regularization parameter lambda (default = 10).
Definition: agmfit.h:21
TVec< TIntSet > CIDNSetV
Community ID -> Member Node ID Sets.
Definition: agmfit.h:11
TNodeI EndNI() const
Returns an iterator referring to the past-the-end node in the graph.
Definition: graph.h:239
double GetUniDev()
Definition: dt.h:30
void AddBaseCmty()
Add Epsilon community (base community which includes all nodes) into community affiliation graph...
Definition: agmfit.cpp:251
void SetCmtyVV(const TVec< TIntV > &CmtyVV)
COMMENT.
Definition: agmfit.cpp:584
TVal1 Val1
Definition: ds.h:34
int AddPlot(const TIntV &YValV, const TGpSeriesTy &SeriesTy=gpwLinesPoints, const TStr &Label=TStr(), const TStr &Style=TStr())
Definition: gnuplot.cpp:186
int GetNbrNId(const int &NodeN) const
Returns ID of NodeN-th neighboring node.
Definition: graph.h:111
TVal2 Val2
Definition: ds.h:35
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
static double DotProduct(const TFltV &x, const TFltV &y)
Definition: linalg.cpp:165
int GetRndKeyId(TRnd &Rnd) const
Get an index of a random element. If the hash table has many deleted keys, this may take a long time...
Definition: hash.h:444
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:523
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int GetId() const
Returns ID of the current node.
Definition: graph.h:88
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
TNodeI BegNI() const
Returns an iterator referring to the first node in the graph.
Definition: graph.h:237
void DelSelfEdges(const PGraph &Graph)
Removes all the self-edges from the graph.
Definition: alg.h:419
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
int Len() const
Definition: hash.h:228
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static void PlotValV(const TVec< TVal1 > &ValV, const TStr &OutFNmPref, const TStr &Desc="", const TStr &XLabel="", const TStr &YLabel="", const TGpScaleTy &ScaleTy=gpsAuto, const bool &PowerFit=false, const TGpSeriesTy &SeriesTy=gpwLinesPoints)
Definition: gnuplot.h:398
PUNGraph G
Graph to fit.
Definition: agmfit.h:10
TIntV ComEdgesV
The number of edges in each community.
Definition: agmfit.h:14
void SampleTransition(int &NID, int &JoinCID, int &LeaveCID, double &DeltaL)
Sample MMCM transitions: Choose among (join, leave, switch), and then sample (NID, CID).
Definition: agmfit.cpp:325
void Save(TSOut &SOut) const
Definition: dt.h:1399
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:252
TFlt PNoCom
Probability of edge when two nodes share no community (epsilon in the paper).
Definition: agmfit.h:15
void RandomInit(const int &MaxK)
COMMENT.
Definition: agmfit.cpp:169
THash< TIntPr, TFlt > NIDCIDPrH
pairs (for sampling MCMC moves).
Definition: agmfit.h:18
static const double Mn
Definition: dt.h:1387