SNAP Library 6.0, User Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
agmfit.cpp
Go to the documentation of this file.
1 #include "stdafx.h"
2 #include "agmfit.h"
3 #include "agm.h"
4 
6 // AGM fitting
7 
8 void TAGMFit::Save(TSOut& SOut) {
9  G->Save(SOut);
10  CIDNSetV.Save(SOut);
11  EdgeComVH.Save(SOut);
12  NIDComVH.Save(SOut);
13  ComEdgesV.Save(SOut);
14  PNoCom.Save(SOut);
15  LambdaV.Save(SOut);
16  NIDCIDPrH.Save(SOut);
17  NIDCIDPrS.Save(SOut);
18  MinLambda.Save(SOut);
19  MaxLambda.Save(SOut);
20  RegCoef.Save(SOut);
21  BaseCID.Save(SOut);
22 }
23 
24 void TAGMFit::Load(TSIn& SIn, const int& RndSeed) {
25  G = TUNGraph::Load(SIn);
26  CIDNSetV.Load(SIn);
27  EdgeComVH.Load(SIn);
28  NIDComVH.Load(SIn);
29  ComEdgesV.Load(SIn);
30  PNoCom.Load(SIn);
31  LambdaV.Load(SIn);
32  NIDCIDPrH.Load(SIn);
33  NIDCIDPrS.Load(SIn);
34  MinLambda.Load(SIn);
35  MaxLambda.Load(SIn);
36  RegCoef.Load(SIn);
37  BaseCID.Load(SIn);
38  Rnd.PutSeed(RndSeed);
39 }
40 
41 // Randomly initialize bipartite community affiliation graph.
42 void TAGMFit::RandomInitCmtyVV(const int InitComs, const double ComSzAlpha, const double MemAlpha, const int MinComSz, const int MaxComSz, const int MinMem, const int MaxMem) {
43  TVec<TIntV> InitCmtyVV(InitComs, 0);
44  TAGMUtil::GenCmtyVVFromPL(InitCmtyVV, G, G->GetNodes(), InitComs, ComSzAlpha, MemAlpha, MinComSz, MaxComSz,
45  MinMem, MaxMem, Rnd);
46  SetCmtyVV(InitCmtyVV);
47 }
48 
49 // For each (u, v) in edges, precompute C_uv (the set of communities u and v share).
51  ComEdgesV.Gen(CIDNSetV.Len());
52  EdgeComVH.Gen(G->GetEdges());
53  for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) {
54  int SrcNID = SrcNI.GetId();
55  for (int v = 0; v < SrcNI.GetDeg(); v++) {
56  int DstNID = SrcNI.GetNbrNId(v);
57  if (SrcNID >= DstNID) { continue; }
58  TIntSet JointCom;
59  IAssert(NIDComVH.IsKey(SrcNID));
60  IAssert(NIDComVH.IsKey(DstNID));
61  TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom);
62  EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom);
63  for (int k = 0; k < JointCom.Len(); k++) {
64  ComEdgesV[JointCom[k]]++;
65  }
66  }
67  }
68  IAssert(EdgeComVH.Len() == G->GetEdges());
69 }
70 
71 // Set epsilon by the default value.
73  PNoCom = 1.0 / (double) G->GetNodes() / (double) G->GetNodes();
74 }
75 
76 double TAGMFit::Likelihood(const TFltV& NewLambdaV, double& LEdges, double& LNoEdges) {
77  IAssert(CIDNSetV.Len() == NewLambdaV.Len());
78  IAssert(ComEdgesV.Len() == CIDNSetV.Len());
79  LEdges = 0.0; LNoEdges = 0.0;
80  for (int e = 0; e < EdgeComVH.Len(); e++) {
81  TIntSet& JointCom = EdgeComVH[e];
82  double LambdaSum = SelectLambdaSum(NewLambdaV, JointCom);
83  double Puv = 1 - exp(- LambdaSum);
84  if (JointCom.Len() == 0) { Puv = PNoCom; }
85  IAssert(! _isnan(log(Puv)));
86  LEdges += log(Puv);
87  }
88  for (int k = 0; k < NewLambdaV.Len(); k++) {
89  int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
90  int NotEdgesInCom = MaxEk - ComEdgesV[k];
91  if(NotEdgesInCom > 0) {
92  if (LNoEdges >= TFlt::Mn + double(NotEdgesInCom) * NewLambdaV[k]) {
93  LNoEdges -= double(NotEdgesInCom) * NewLambdaV[k];
94  }
95  }
96  }
97  double LReg = 0.0;
98  if (RegCoef > 0.0) {
99  LReg = - RegCoef * TLinAlg::SumVec(NewLambdaV);
100  }
101  return LEdges + LNoEdges + LReg;
102 }
103 
105  return Likelihood(LambdaV);
106 }
107 
108 // Step size search for updating P_c (which is parametarized by lambda).
109 double TAGMFit::GetStepSizeByLineSearchForLambda(const TFltV& DeltaV, const TFltV& GradV, const double& Alpha, const double& Beta) {
110  double StepSize = 1.0;
111  double InitLikelihood = Likelihood();
112  IAssert(LambdaV.Len() == DeltaV.Len());
113  TFltV NewLambdaV(LambdaV.Len());
114  for (int iter = 0; ; iter++) {
115  for (int i = 0; i < LambdaV.Len(); i++) {
116  NewLambdaV[i] = LambdaV[i] + StepSize * DeltaV[i];
117  if (NewLambdaV[i] < MinLambda) { NewLambdaV[i] = MinLambda; }
118  if (NewLambdaV[i] > MaxLambda) { NewLambdaV[i] = MaxLambda; }
119  }
120  if (Likelihood(NewLambdaV) < InitLikelihood + Alpha * StepSize * TLinAlg::DotProduct(GradV, DeltaV)) {
121  StepSize *= Beta;
122  } else {
123  break;
124  }
125  }
126  return StepSize;
127 }
128 
129 // Gradient descent for p_c while fixing community affiliation graph (CAG).
130 int TAGMFit::MLEGradAscentGivenCAG(const double& Thres, const int& MaxIter, const TStr PlotNm) {
131  int Edges = G->GetEdges();
132  TExeTm ExeTm;
133  TFltV GradV(LambdaV.Len());
134  int iter = 0;
135  TIntFltPrV IterLV, IterGradNormV;
136  double GradCutOff = 1000;
137  for (iter = 0; iter < MaxIter; iter++) {
138  GradLogLForLambda(GradV); //if gradient is going out of the boundary, cut off
139  for (int i = 0; i < LambdaV.Len(); i++) {
140  if (GradV[i] < -GradCutOff) { GradV[i] = -GradCutOff; }
141  if (GradV[i] > GradCutOff) { GradV[i] = GradCutOff; }
142  if (LambdaV[i] <= MinLambda && GradV[i] < 0) { GradV[i] = 0.0; }
143  if (LambdaV[i] >= MaxLambda && GradV[i] > 0) { GradV[i] = 0.0; }
144  }
145  double Alpha = 0.15, Beta = 0.2;
146  if (Edges > Kilo(100)) { Alpha = 0.00015; Beta = 0.3;}
147  double LearnRate = GetStepSizeByLineSearchForLambda(GradV, GradV, Alpha, Beta);
148  if (TLinAlg::Norm(GradV) < Thres) { break; }
149  for (int i = 0; i < LambdaV.Len(); i++) {
150  double Change = LearnRate * GradV[i];
151  LambdaV[i] += Change;
152  if(LambdaV[i] < MinLambda) { LambdaV[i] = MinLambda;}
153  if(LambdaV[i] > MaxLambda) { LambdaV[i] = MaxLambda;}
154  }
155  if (! PlotNm.Empty()) {
156  double L = Likelihood();
157  IterLV.Add(TIntFltPr(iter, L));
158  IterGradNormV.Add(TIntFltPr(iter, TLinAlg::Norm(GradV)));
159  }
160  }
161  if (! PlotNm.Empty()) {
162  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
163  TGnuPlot::PlotValV(IterGradNormV, PlotNm + ".gradnorm_Q");
164  printf("MLE for Lambda completed with %d iterations(%s)\n",iter,ExeTm.GetTmStr());
165  }
166  return iter;
167 }
168 
169 void TAGMFit::RandomInit(const int& MaxK) {
170  CIDNSetV.Clr();
171  for (int c = 0; c < MaxK; c++) {
172  CIDNSetV.Add();
173  int NC = Rnd.GetUniDevInt(G -> GetNodes());
174  TUNGraph::TNodeI NI = G -> GetRndNI();
175  CIDNSetV.Last().AddKey(NI.GetId());
176  for (int v = 0; v < NC; v++) {
177  NI = G->GetNI(NI.GetNbrNId(Rnd.GetUniDevInt(NI.GetDeg())));
178  CIDNSetV.Last().AddKey(NI.GetId());
179  }
180  }
181  InitNodeData();
183 }
184 
185 // Initialize node community memberships using best neighborhood communities (see D. Gleich et al. KDD'12).
186 void TAGMFit::NeighborComInit(const int InitComs) {
187  CIDNSetV.Gen(InitComs);
188  const int Edges = G->GetEdges();
189  TFltIntPrV NIdPhiV(G->GetNodes(), 0);
190  TIntSet InvalidNIDS(G->GetNodes());
191  TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
192  TExeTm RunTm;
193  //compute conductance of neighborhood community
194  TIntV NIdV;
195  G->GetNIdV(NIdV);
196  for (int u = 0; u < NIdV.Len(); u++) {
197  TIntSet NBCmty(G->GetNI(NIdV[u]).GetDeg() + 1);
198  double Phi;
199  if (G->GetNI(NIdV[u]).GetDeg() < 5) { //do not include nodes with too few degree
200  Phi = 1.0;
201  } else {
202  TAGMUtil::GetNbhCom(G, NIdV[u], NBCmty);
203  IAssert(NBCmty.Len() == G->GetNI(NIdV[u]).GetDeg() + 1);
204  Phi = TAGMUtil::GetConductance(G, NBCmty, Edges);
205  }
206  NIdPhiV.Add(TFltIntPr(Phi, NIdV[u]));
207  }
208  NIdPhiV.Sort(true);
209  printf("conductance computation completed [%s]\n", RunTm.GetTmStr());
210  fflush(stdout);
211  //choose nodes with local minimum in conductance
212  int CurCID = 0;
213  for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
214  int UID = NIdPhiV[ui].Val2;
215  fflush(stdout);
216  if (InvalidNIDS.IsKey(UID)) { continue; }
217  ChosenNIDV.Add(UID); //FOR DEBUG
218  //add the node and its neighbors to the current community
219  CIDNSetV[CurCID].AddKey(UID);
220  TUNGraph::TNodeI NI = G->GetNI(UID);
221  fflush(stdout);
222  for (int e = 0; e < NI.GetDeg(); e++) {
223  CIDNSetV[CurCID].AddKey(NI.GetNbrNId(e));
224  }
225  //exclude its neighbors from the next considerations
226  for (int e = 0; e < NI.GetDeg(); e++) {
227  InvalidNIDS.AddKey(NI.GetNbrNId(e));
228  }
229  CurCID++;
230  fflush(stdout);
231  if (CurCID >= InitComs) { break; }
232  }
233  if (InitComs > CurCID) {
234  printf("%d communities needed to fill randomly\n", InitComs - CurCID);
235  }
236  //assign a member to zero-member community (if any)
237  for (int c = 0; c < CIDNSetV.Len(); c++) {
238  if (CIDNSetV[c].Len() == 0) {
239  int ComSz = 10;
240  for (int u = 0; u < ComSz; u++) {
241  int UID = G->GetRndNI().GetId();
242  CIDNSetV[c].AddKey(UID);
243  }
244  }
245  }
246  InitNodeData();
248 }
249 
250 // Add epsilon community (base community which includes all nodes) into community affiliation graph. It means that we fit for epsilon.
252  TVec<TIntV> CmtyVV;
253  GetCmtyVV(CmtyVV);
254  TIntV TmpV = CmtyVV[0];
255  CmtyVV.Add(TmpV);
256  G->GetNIdV(CmtyVV[0]);
257  IAssert(CIDNSetV.Len() + 1 == CmtyVV.Len());
258  SetCmtyVV(CmtyVV);
259  InitNodeData();
260  BaseCID = 0;
261  PNoCom = 0.0;
262 }
263 
266  NIDComVH.Gen(G->GetNodes());
267  for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) {
268  NIDComVH.AddDat(NI.GetId());
269  }
271  GetEdgeJointCom();
272  LambdaV.Gen(CIDNSetV.Len());
273  for (int c = 0; c < CIDNSetV.Len(); c++) {
274  int MaxE = (CIDNSetV[c].Len()) * (CIDNSetV[c].Len() - 1) / 2;
275  if (MaxE < 2) {
276  LambdaV[c] = MaxLambda;
277  }
278  else{
279  LambdaV[c] = -log((double) (MaxE - ComEdgesV[c]) / MaxE);
280  }
281  if (LambdaV[c] > MaxLambda) { LambdaV[c] = MaxLambda; }
282  if (LambdaV[c] < MinLambda) { LambdaV[c] = MinLambda; }
283  }
284  NIDCIDPrS.Gen(G->GetNodes() * 10);
285  for (int c = 0; c < CIDNSetV.Len(); c++) {
286  for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) {
287  NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c));
288  }
289  }
290 }
291 
292 // After MCMC, NID leaves community CID.
293 void TAGMFit::LeaveCom(const int& NID, const int& CID) {
294  TUNGraph::TNodeI NI = G->GetNI(NID);
295  for (int e = 0; e < NI.GetDeg(); e++) {
296  int VID = NI.GetNbrNId(e);
297  if (NIDComVH.GetDat(VID).IsKey(CID)) {
298  TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
299  EdgeComVH.GetDat(SrcDstNIDPr).DelKey(CID);
300  ComEdgesV[CID]--;
301  }
302  }
303  CIDNSetV[CID].DelKey(NID);
304  NIDComVH.GetDat(NID).DelKey(CID);
305  NIDCIDPrS.DelKey(TIntPr(NID, CID));
306 }
307 
308 // After MCMC, NID joins community CID.
309 void TAGMFit::JoinCom(const int& NID, const int& JoinCID) {
310  TUNGraph::TNodeI NI = G->GetNI(NID);
311  for (int e = 0; e < NI.GetDeg(); e++) {
312  int VID = NI.GetNbrNId(e);
313  if (NIDComVH.GetDat(VID).IsKey(JoinCID)) {
314  TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
315  EdgeComVH.GetDat(SrcDstNIDPr).AddKey(JoinCID);
316  ComEdgesV[JoinCID]++;
317  }
318  }
319  CIDNSetV[JoinCID].AddKey(NID);
320  NIDComVH.GetDat(NID).AddKey(JoinCID);
321  NIDCIDPrS.AddKey(TIntPr(NID, JoinCID));
322 }
323 
324 // Sample transition: Choose among (join, leave, switch), and then sample (NID, CID).
325 void TAGMFit::SampleTransition(int& NID, int& JoinCID, int& LeaveCID, double& DeltaL) {
326  int Option = Rnd.GetUniDevInt(3); //0:Join 1:Leave 2:Switch
327  if (NIDCIDPrS.Len() <= 1) { Option = 0; } //if there is only one node membership, only join is possible.
328  int TryCnt = 0;
329  const int MaxTryCnt = G->GetNodes();
330  DeltaL = TFlt::Mn;
331  if (Option == 0) {
332  do {
333  JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
334  NID = G->GetRndNId();
335  } while (TryCnt++ < MaxTryCnt && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
336  if (TryCnt < MaxTryCnt) { //if successfully find a move
337  DeltaL = SeekJoin(NID, JoinCID);
338  }
339  }
340  else if (Option == 1) {
341  do {
343  NID = NIDCIDPr.Val1;
344  LeaveCID = NIDCIDPr.Val2;
345  } while (TryCnt++ < MaxTryCnt && LeaveCID == BaseCID);
346  if (TryCnt < MaxTryCnt) {//if successfully find a move
347  DeltaL = SeekLeave(NID, LeaveCID);
348  }
349  }
350  else{
351  do {
353  NID = NIDCIDPr.Val1;
354  LeaveCID = NIDCIDPr.Val2;
355  } while (TryCnt++ < MaxTryCnt && (NIDComVH.GetDat(NID).Len() == CIDNSetV.Len() || LeaveCID == BaseCID));
356  do {
357  JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
358  } while (TryCnt++ < G->GetNodes() && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
359  if (TryCnt < MaxTryCnt) {//if successfully find a move
360  DeltaL = SeekSwitch(NID, LeaveCID, JoinCID);
361  }
362  }
363 }
364 
366 void TAGMFit::RunMCMC(const int& MaxIter, const int& EvalLambdaIter, const TStr& PlotFPrx) {
367  TExeTm IterTm, TotalTm;
368  double PrevL = Likelihood(), DeltaL = 0;
369  double BestL = PrevL;
370  printf("initial likelihood = %f\n",PrevL);
371  TIntFltPrV IterTrueLV, IterJoinV, IterLeaveV, IterAcceptV, IterSwitchV, IterLBV;
372  TIntPrV IterTotMemV;
373  TIntV IterV;
374  TFltV BestLV;
375  TVec<TIntSet> BestCmtySetV;
376  int SwitchCnt = 0, LeaveCnt = 0, JoinCnt = 0, AcceptCnt = 0, ProbBinSz;
377  int Nodes = G->GetNodes(), Edges = G->GetEdges();
378  TExeTm PlotTm;
379  ProbBinSz = TMath::Mx(1000, G->GetNodes() / 10); //bin to compute probabilities
380  IterLBV.Add(TIntFltPr(1, BestL));
381 
382  for (int iter = 0; iter < MaxIter; iter++) {
383  IterTm.Tick();
384  int NID = -1;
385  int JoinCID = -1, LeaveCID = -1;
386  SampleTransition(NID, JoinCID, LeaveCID, DeltaL); //sample a move
387  double OptL = PrevL;
388  if (DeltaL > 0 || Rnd.GetUniDev() < exp(DeltaL)) { //if it is accepted
389  IterTm.Tick();
390  if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCom(NID, LeaveCID); }
391  if (JoinCID > -1 && JoinCID != BaseCID) { JoinCom(NID, JoinCID); }
392  if (LeaveCID > -1 && JoinCID > -1 && JoinCID != BaseCID && LeaveCID != BaseCID) { SwitchCnt++; }
393  else if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCnt++;}
394  else if (JoinCID > -1 && JoinCID != BaseCID) { JoinCnt++;}
395  AcceptCnt++;
396  if ((iter + 1) % EvalLambdaIter == 0) {
397  IterTm.Tick();
398  MLEGradAscentGivenCAG(0.01, 3);
399  OptL = Likelihood();
400  }
401  else{
402  OptL = PrevL + DeltaL;
403  }
404  if (BestL <= OptL && CIDNSetV.Len() > 0) {
405  BestCmtySetV = CIDNSetV;
406  BestLV = LambdaV;
407  BestL = OptL;
408  }
409  }
410  if (iter > 0 && (iter % ProbBinSz == 0) && PlotFPrx.Len() > 0) {
411  IterLBV.Add(TIntFltPr(iter, OptL));
412  IterSwitchV.Add(TIntFltPr(iter, (double) SwitchCnt / (double) AcceptCnt));
413  IterLeaveV.Add(TIntFltPr(iter, (double) LeaveCnt / (double) AcceptCnt));
414  IterJoinV.Add(TIntFltPr(iter, (double) JoinCnt / (double) AcceptCnt));
415  IterAcceptV.Add(TIntFltPr(iter, (double) AcceptCnt / (double) ProbBinSz));
416  SwitchCnt = JoinCnt = LeaveCnt = AcceptCnt = 0;
417  }
418  PrevL = OptL;
419  if ((iter + 1) % 10000 == 0) {
420  printf("\r%d iterations completed [%.2f]", iter, (double) iter / (double) MaxIter);
421  }
422  }
423 
424  // plot the likelihood and acceptance probabilities if the plot file name is given
425  if (PlotFPrx.Len() > 0) {
426  TGnuPlot GP1;
427  GP1.AddPlot(IterLBV, gpwLinesPoints, "likelihood");
428  GP1.SetDataPlotFNm(PlotFPrx + ".likelihood.tab", PlotFPrx + ".likelihood.plt");
429  TStr TitleStr = TStr::Fmt(" N:%d E:%d", Nodes, Edges);
430  GP1.SetTitle(PlotFPrx + ".likelihood" + TitleStr);
431  GP1.SavePng(PlotFPrx + ".likelihood.png");
432 
433  TGnuPlot GP2;
434  GP2.AddPlot(IterSwitchV, gpwLinesPoints, "Switch");
435  GP2.AddPlot(IterLeaveV, gpwLinesPoints, "Leave");
436  GP2.AddPlot(IterJoinV, gpwLinesPoints, "Join");
437  GP2.AddPlot(IterAcceptV, gpwLinesPoints, "Accept");
438  GP2.SetTitle(PlotFPrx + ".transition");
439  GP2.SetDataPlotFNm(PlotFPrx + "transition_prob.tab", PlotFPrx + "transition_prob.plt");
440  GP2.SavePng(PlotFPrx + "transition_prob.png");
441  }
442  CIDNSetV = BestCmtySetV;
443  LambdaV = BestLV;
444 
445  InitNodeData();
446  MLEGradAscentGivenCAG(0.001, 100);
447  printf("\nMCMC completed (best likelihood: %.2f) [%s]\n", BestL, TotalTm.GetTmStr());
448 }
449 
450 // Returns \v QV, a vector of (1 - p_c) for each community c.
451 void TAGMFit::GetQV(TFltV& OutV) {
452  OutV.Gen(LambdaV.Len());
453  for (int i = 0; i < LambdaV.Len(); i++) {
454  OutV[i] = exp(- LambdaV[i]);
455  }
456 }
457 
458 // Remove empty communities.
460  int DelCnt = 0;
461  for (int c = 0; c < CIDNSetV.Len(); c++) {
462  if (CIDNSetV[c].Len() == 0) {
463  CIDNSetV[c] = CIDNSetV.Last();
464  CIDNSetV.DelLast();
465  LambdaV[c] = LambdaV.Last();
466  LambdaV.DelLast();
467  DelCnt++;
468  c--;
469  }
470  }
471  return DelCnt;
472 }
473 
474 // Compute the change in likelihood (Delta) if node UID leaves community CID.
475 double TAGMFit::SeekLeave(const int& UID, const int& CID) {
476  IAssert(CIDNSetV[CID].IsKey(UID));
477  IAssert(G->IsNode(UID));
478  double Delta = 0.0;
479  TUNGraph::TNodeI NI = G->GetNI(UID);
480  int NbhsInC = 0;
481  for (int e = 0; e < NI.GetDeg(); e++) {
482  const int VID = NI.GetNbrNId(e);
483  if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
484  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
485  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
486  double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
487  CurPuv = 1 - exp(- LambdaSum);
488  NewPuv = 1 - exp(- LambdaSum + LambdaV[CID]);
489  IAssert(JointCom.Len() > 0);
490  if (JointCom.Len() == 1) {
491  NewPuv = PNoCom;
492  }
493  Delta += (log(NewPuv) - log(CurPuv));
494  IAssert(!_isnan(Delta));
495  NbhsInC++;
496  }
497  Delta += LambdaV[CID] * (CIDNSetV[CID].Len() - 1 - NbhsInC);
498  return Delta;
499 }
500 
501 // Compute the change in likelihood (Delta) if node UID joins community CID.
502 double TAGMFit::SeekJoin(const int& UID, const int& CID) {
503  IAssert(! CIDNSetV[CID].IsKey(UID));
504  double Delta = 0.0;
505  TUNGraph::TNodeI NI = G->GetNI(UID);
506  int NbhsInC = 0;
507  for (int e = 0; e < NI.GetDeg(); e++) {
508  const int VID = NI.GetNbrNId(e);
509  if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
510  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
511  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
512  double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
513  CurPuv = 1 - exp(- LambdaSum);
514  if (JointCom.Len() == 0) { CurPuv = PNoCom; }
515  NewPuv = 1 - exp(- LambdaSum - LambdaV[CID]);
516  Delta += (log(NewPuv) - log(CurPuv));
517  IAssert(!_isnan(Delta));
518  NbhsInC++;
519  }
520  Delta -= LambdaV[CID] * (CIDNSetV[CID].Len() - NbhsInC);
521  return Delta;
522 }
523 
524 // Compute the change in likelihood (Delta) if node UID switches from CurCID to NewCID.
525 double TAGMFit::SeekSwitch(const int& UID, const int& CurCID, const int& NewCID) {
526  IAssert(! CIDNSetV[NewCID].IsKey(UID));
527  IAssert(CIDNSetV[CurCID].IsKey(UID));
528  double Delta = SeekJoin(UID, NewCID) + SeekLeave(UID, CurCID);
529  //correct only for intersection between new com and current com
530  TUNGraph::TNodeI NI = G->GetNI(UID);
531  for (int e = 0; e < NI.GetDeg(); e++) {
532  const int VID = NI.GetNbrNId(e);
533  if (! NIDComVH.GetDat(VID).IsKey(CurCID) || ! NIDComVH.GetDat(VID).IsKey(NewCID)) {continue;}
534  TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
535  TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
536  double CurPuv, NewPuvAfterJoin, NewPuvAfterLeave, NewPuvAfterSwitch, LambdaSum = SelectLambdaSum(JointCom);
537  CurPuv = 1 - exp(- LambdaSum);
538  NewPuvAfterLeave = 1 - exp(- LambdaSum + LambdaV[CurCID]);
539  NewPuvAfterJoin = 1 - exp(- LambdaSum - LambdaV[NewCID]);
540  NewPuvAfterSwitch = 1 - exp(- LambdaSum - LambdaV[NewCID] + LambdaV[CurCID]);
541  if (JointCom.Len() == 1 || NewPuvAfterLeave == 0.0) {
542  NewPuvAfterLeave = PNoCom;
543  }
544  Delta += (log(NewPuvAfterSwitch) + log(CurPuv) - log(NewPuvAfterLeave) - log(NewPuvAfterJoin));
545  if (_isnan(Delta)) {
546  printf("NS:%f C:%f NL:%f NJ:%f PNoCom:%f", NewPuvAfterSwitch, CurPuv, NewPuvAfterLeave, NewPuvAfterJoin, PNoCom.Val);
547  }
548  IAssert(!_isnan(Delta));
549  }
550  return Delta;
551 }
552 
553 // Get communities whose p_c is higher than 1 - QMax.
554 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, const double QMax) {
555  TFltV TmpQV;
556  GetCmtyVV(CmtyVV, TmpQV, QMax);
557 }
558 
559 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, TFltV& QV, const double QMax) {
560  CmtyVV.Gen(CIDNSetV.Len(), 0);
561  QV.Gen(CIDNSetV.Len(), 0);
562  TIntFltH CIDLambdaH(CIDNSetV.Len());
563  for (int c = 0; c < CIDNSetV.Len(); c++) {
564  CIDLambdaH.AddDat(c, LambdaV[c]);
565  }
566  CIDLambdaH.SortByDat(false);
567  for (int c = 0; c < CIDNSetV.Len(); c++) {
568  int CID = CIDLambdaH.GetKey(c);
569  IAssert(LambdaV[CID] >= MinLambda);
570  double Q = exp( - (double) LambdaV[CID]);
571  if (Q > QMax) { continue; }
572  TIntV CmtyV;
573  CIDNSetV[CID].GetKeyV(CmtyV);
574  if (CmtyV.Len() == 0) { continue; }
575  if (CID == BaseCID) { //if the community is the base community(epsilon community), discard
576  IAssert(CmtyV.Len() == G->GetNodes());
577  } else {
578  CmtyVV.Add(CmtyV);
579  QV.Add(Q);
580  }
581  }
582 }
583 
584 void TAGMFit::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
585  CIDNSetV.Gen(CmtyVV.Len());
586  for (int c = 0; c < CIDNSetV.Len(); c++) {
587  CIDNSetV[c].AddKeyV(CmtyVV[c]);
588  for (int j = 0; j < CmtyVV[c].Len(); j++) { IAssert(G->IsNode(CmtyVV[c][j])); } // check whether the member nodes exist in the graph
589  }
590  InitNodeData();
592 }
593 
594 // Gradient of likelihood for P_c.
596  GradV.Gen(LambdaV.Len());
597  TFltV SumEdgeProbsV(LambdaV.Len());
598  for (int e = 0; e < EdgeComVH.Len(); e++) {
599  TIntSet& JointCom = EdgeComVH[e];
600  double LambdaSum = SelectLambdaSum(JointCom);
601  double Puv = 1 - exp(- LambdaSum);
602  if (JointCom.Len() == 0) { Puv = PNoCom; }
603  for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) {
604  SumEdgeProbsV[SI.GetKey()] += (1 - Puv) / Puv;
605  }
606  }
607  for (int k = 0; k < LambdaV.Len(); k++) {
608  int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
609  int NotEdgesInCom = MaxEk - ComEdgesV[k];
610  GradV[k] = SumEdgeProbsV[k] - (double) NotEdgesInCom;
611  if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists
612  GradV[k] -= RegCoef;
613  }
614  }
615 }
616 
617 // Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). It is used to compute edge probability P_uv.
618 double TAGMFit::SelectLambdaSum(const TIntSet& ComK) {
619  return SelectLambdaSum(LambdaV, ComK);
620 }
621 
622 double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
623  double Result = 0.0;
624  for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) {
625  IAssert(NewLambdaV[SI.GetKey()] >= 0);
626  Result += NewLambdaV[SI.GetKey()];
627  }
628  return Result;
629 }
630 
631 // Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations.
632 double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) {
633  TIntV NIdV;
634  G->GetNIdV(NIdV);
635  uint64 PairNoCom = 0, EdgesNoCom = 0;
636  for (int u = 0; u < NIdV.Len(); u++) {
637  for (int v = u + 1; v < NIdV.Len(); v++) {
638  int SrcNID = NIdV[u], DstNID = NIdV[v];
639  TIntSet JointCom;
640  TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom);
641  if(JointCom.Len() == 0) {
642  PairNoCom++;
643  if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; }
644  if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
645  }
646  }
647  if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
648  }
649  double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes();
650  if (EdgesNoCom > 0) {
651  PNoCom = (double) EdgesNoCom / (double) PairNoCom;
652  } else {
653  PNoCom = DefaultVal;
654  }
655  printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val);
656  return PNoCom;
657 }
658 
660  TIntFltH CIDLambdaH(CIDNSetV.Len());
661  for (int c = 0; c < CIDNSetV.Len(); c++) {
662  CIDLambdaH.AddDat(c, LambdaV[c]);
663  }
664  CIDLambdaH.SortByDat(false);
665  int Coms = 0;
666  for (int i = 0; i < LambdaV.Len(); i++) {
667  int CID = CIDLambdaH.GetKey(i);
668  if (LambdaV[CID] <= 0.0001) { continue; }
669  printf("P_c : %.3f Com Sz: %d, Total Edges inside: %d \n", 1.0 - exp(- LambdaV[CID]), CIDNSetV[CID].Len(), (int) ComEdgesV[CID]);
670  Coms++;
671  }
672  printf("%d Communities, Total Memberships = %d, Likelihood = %.2f, Epsilon = %f\n", Coms, NIDCIDPrS.Len(), Likelihood(), PNoCom.Val);
673 }
#define IAssert(Cond)
Definition: bd.h:262
static const T & Mn(const T &LVal, const T &RVal)
Definition: xmath.h:36
void Load(TSIn &SIn, const int &RndSeed=0)
Definition: agmfit.cpp:24
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
static void GetNbhCom(const PUNGraph &Graph, const int NID, TIntSet &NBCmtyS)
Definition: agm.cpp:706
static double Norm(const TFltV &x)
Definition: linalg.cpp:324
TPair< TFlt, TInt > TFltIntPr
Definition: ds.h:97
int Len() const
Definition: dt.h:490
Definition: tm.h:355
TRnd Rnd
Definition: agmfit.h:17
#define Kilo(n)
Definition: gbase.h:3
double SeekSwitch(const int &UID, const int &CurCID, const int &NewCID)
Definition: agmfit.cpp:525
static double SumVec(const TFltV &x)
Definition: linalg.cpp:279
static void GetNodeMembership(THash< TInt, TIntSet > &NIDComVH, const TVec< TIntV > &CmtyVV)
get hash table of
Definition: agm.cpp:335
void SavePng(const int &SizeX=1000, const int &SizeY=800, const TStr &Comment=TStr())
Definition: gnuplot.h:120
static const T & Mx(const T &LVal, const T &RVal)
Definition: xmath.h:32
void SetDefaultPNoCom()
Set Epsilon (the probability that two nodes sharing no communities connect) to the default value...
Definition: agmfit.cpp:72
void Save(TSOut &SOut) const
Definition: dt.h:1153
THash< TIntPr, TInt > NIDCIDPrS
pairs (for sampling MCMC moves).
Definition: agmfit.h:19
TIter BegI() const
Definition: shash.h:1105
void GetNIdV(TIntV &NIdV) const
Gets a vector IDs of all nodes in the graph.
Definition: graph.cpp:153
void SetTitle(const TStr &PlotTitle)
Definition: gnuplot.h:70
double GetStepSizeByLineSearchForLambda(const TFltV &DeltaV, const TFltV &GradV, const double &Alpha, const double &Beta)
Step size search for updating P_c (which is parametarized by regularization parameter lambda)...
Definition: agmfit.cpp:109
double Likelihood()
COMMENT.
Definition: agmfit.cpp:104
double Val
Definition: dt.h:1388
int GetEdges() const
Returns the number of edges in the graph.
Definition: graph.cpp:82
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:68
TFltV LambdaV
Parametrization of P_c (edge probability in community c), P_c = 1 - exp(-lambda). ...
Definition: agmfit.h:16
void Save(TSOut &SOut) const
Definition: hash.h:183
void PrintSummary()
COMMENT.
Definition: agmfit.cpp:659
TInt BaseCID
ID of the Epsilon-community (in case we fit P_c of the epsilon community). We do not fit for the Epsi...
Definition: agmfit.h:23
static void GetIntersection(const THashSet< TInt > &A, const THashSet< TInt > &B, THashSet< TInt > &C)
Definition: agm.cpp:404
void Save(TSOut &SOut) const
Saves the graph to a (binary) stream SOut.
Definition: graph.h:170
void SetDataPlotFNm(const TStr &DatFNm, const TStr &PltFNm)
Definition: gnuplot.h:74
THash< TIntPr, TIntSet > EdgeComVH
Edge -> Shared Community ID Set.
Definition: agmfit.h:12
void Load(TSIn &SIn)
Definition: ds.h:946
static double GetConductance(const PUNGraph &Graph, const TIntSet &CmtyS, const int Edges)
Definition: agm.cpp:683
void GetEdgeJointCom()
For each (u, v) in edges, precompute C_uv (the set of communities nodes u and v share).
Definition: agmfit.cpp:50
int GetNodes() const
Returns the number of nodes in the graph.
Definition: graph.h:192
void Load(TSIn &SIn)
Definition: hash.h:177
Definition: fl.h:58
void Save(TSOut &SOut) const
Definition: ds.h:954
int GetRndNId(TRnd &Rnd=TInt::Rnd)
Returns an ID of a random node in the graph.
Definition: graph.h:289
TPair< TInt, TFlt > TIntFltPr
Definition: ds.h:87
int GetDeg() const
Returns degree of the current node.
Definition: graph.h:90
const char * GetTmStr() const
Definition: tm.h:370
void GradLogLForLambda(TFltV &GradV)
Definition: agmfit.cpp:595
void DelKey(const TKey &Key)
Definition: hash.h:404
double CalcPNoComByCmtyVV(const int &SamplePairs=-1)
Compute the empirical edge probability between a pair of nodes who share no community (epsilon)...
Definition: agmfit.cpp:632
void Save(TSOut &SOut)
Definition: agmfit.cpp:8
void LeaveCom(const int &NID, const int &CID)
After MCMC, NID leaves community CID.
Definition: agmfit.cpp:293
TFlt MinLambda
Minimum value of regularization parameter lambda (default = 1e-5).
Definition: agmfit.h:20
void GetCmtyVV(TVec< TIntV > &CmtyVV, const double QMax=2.0)
Get communities whose p_c is higher than 1 - QMax.
Definition: agmfit.cpp:554
int RemoveEmptyCom()
Remove all communities with no members.
Definition: agmfit.cpp:459
int MLEGradAscentGivenCAG(const double &Thres=0.001, const int &MaxIter=10000, const TStr PlotNm=TStr())
Gradient descent for p_c while keeping the community affiliation graph (CAG) fixed.
Definition: agmfit.cpp:130
void Gen(const int &ExpectVals)
Definition: hash.h:222
THash< TInt, TIntSet > NIDComVH
Node ID -> Communitie IDs the node belongs to.
Definition: agmfit.h:13
unsigned long long uint64
Definition: bd.h:38
double SelectLambdaSum(const TIntSet &ComK)
Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). The function is used to compute ed...
Definition: agmfit.cpp:618
static void GenCmtyVVFromPL(TVec< TIntV > &CmtyVV, const PUNGraph &Graph, const int &Nodes, const int &Coms, const double &ComSzAlpha, const double &MemAlpha, const int &MinSz, const int &MaxSz, const int &MinK, const int &MaxK, TRnd &Rnd)
Generate bipartite community affiliation from given power law coefficients for membership distributio...
Definition: agm.cpp:101
TFlt RegCoef
Regularization parameter when we fit for P_c (for finding # communities).
Definition: agmfit.h:22
TNodeI GetNI(const int &NId) const
Returns an iterator referring to the node of ID NId in the graph.
Definition: graph.h:241
void Load(TSIn &SIn)
Definition: dt.h:1152
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
void NeighborComInit(const int InitComs)
Initialize node community memberships using best neighborhood communities (see D. Gleich et al...
Definition: agmfit.cpp:186
void GetQV(TFltV &OutV)
Returns QV, a vector of (1 - p_c) for each community c.
Definition: agmfit.cpp:451
TNodeI GetRndNI(TRnd &Rnd=TInt::Rnd)
Returns an interator referring to a random node in the graph.
Definition: graph.h:291
void Tick()
Definition: tm.h:364
Definition: fl.h:128
static PUNGraph Load(TSIn &SIn)
Static constructor that loads the graph from a stream SIn and returns a pointer to it...
Definition: graph.h:178
TIter EndI() const
Definition: shash.h:1112
int Len() const
Definition: shash.h:1121
Definition: ds.h:32
void PutSeed(const int &_Seed)
Definition: dt.cpp:18
int AddKey(const TKey &Key)
Definition: hash.h:373
double SeekJoin(const int &UID, const int &CID)
Compute the change in likelihood (Delta) if node UID joins community CID.
Definition: agmfit.cpp:502
TStr GetStr() const
Definition: dt.h:1363
void RunMCMC(const int &MaxIter, const int &EvalLambdaIter, const TStr &PlotFPrx=TStr())
Main procedure for fitting the AGM to a given graph using MCMC.
Definition: agmfit.cpp:366
Definition: dt.h:412
bool Empty() const
Definition: dt.h:491
void InitNodeData()
COMMENT.
Definition: agmfit.cpp:264
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
double SeekLeave(const int &UID, const int &CID)
Compute the change in likelihood (Delta) if node UID leaves community CID.
Definition: agmfit.cpp:475
void JoinCom(const int &NID, const int &JoinCID)
Definition: agmfit.cpp:309
void RandomInitCmtyVV(const int InitComs, const double ComSzAlpha=1.3, const double MemAlpha=1.8, const int MinComSz=8, const int MaxComSz=200, const int MinMem=1, const int MaxMem=10)
Randomly initialize bipartite community affiliation graph.
Definition: agmfit.cpp:42
void Load(TSIn &SIn)
Definition: dt.h:1405
TFlt MaxLambda
Maximum value of regularization parameter lambda (default = 10).
Definition: agmfit.h:21
TVec< TIntSet > CIDNSetV
Community ID -> Member Node ID Sets.
Definition: agmfit.h:11
TNodeI EndNI() const
Returns an iterator referring to the past-the-end node in the graph.
Definition: graph.h:239
double GetUniDev()
Definition: dt.h:30
void AddBaseCmty()
Add Epsilon community (base community which includes all nodes) into community affiliation graph...
Definition: agmfit.cpp:251
void SetCmtyVV(const TVec< TIntV > &CmtyVV)
COMMENT.
Definition: agmfit.cpp:584
TVal1 Val1
Definition: ds.h:34
int AddPlot(const TIntV &YValV, const TGpSeriesTy &SeriesTy=gpwLinesPoints, const TStr &Label=TStr(), const TStr &Style=TStr())
Definition: gnuplot.cpp:186
int GetNbrNId(const int &NodeN) const
Returns ID of NodeN-th neighboring node.
Definition: graph.h:111
TVal2 Val2
Definition: ds.h:35
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
static double DotProduct(const TFltV &x, const TFltV &y)
Definition: linalg.cpp:165
int GetRndKeyId(TRnd &Rnd) const
Get an index of a random element. If the hash table has many deleted keys, this may take a long time...
Definition: hash.h:444
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:523
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int GetId() const
Returns ID of the current node.
Definition: graph.h:88
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
bool IsKey(const TKey &Key) const
Definition: hash.h:258
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
TNodeI BegNI() const
Returns an iterator referring to the first node in the graph.
Definition: graph.h:237
void DelSelfEdges(const PGraph &Graph)
Removes all the self-edges from the graph.
Definition: alg.h:419
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
int Len() const
Definition: hash.h:228
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
static void PlotValV(const TVec< TVal1 > &ValV, const TStr &OutFNmPref, const TStr &Desc="", const TStr &XLabel="", const TStr &YLabel="", const TGpScaleTy &ScaleTy=gpsAuto, const bool &PowerFit=false, const TGpSeriesTy &SeriesTy=gpwLinesPoints)
Definition: gnuplot.h:398
PUNGraph G
Graph to fit.
Definition: agmfit.h:10
TIntV ComEdgesV
The number of edges in each community.
Definition: agmfit.h:14
void SampleTransition(int &NID, int &JoinCID, int &LeaveCID, double &DeltaL)
Sample MMCM transitions: Choose among (join, leave, switch), and then sample (NID, CID).
Definition: agmfit.cpp:325
void Save(TSOut &SOut) const
Definition: dt.h:1402
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:252
TFlt PNoCom
Probability of edge when two nodes share no community (epsilon in the paper).
Definition: agmfit.h:15
void RandomInit(const int &MaxK)
COMMENT.
Definition: agmfit.cpp:169
THash< TIntPr, TFlt > NIDCIDPrH
pairs (for sampling MCMC moves).
Definition: agmfit.h:18
static const double Mn
Definition: dt.h:1390