SNAP Library 6.0, User Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
agmattr.cpp
Go to the documentation of this file.
1 
2 #include "stdafx.h"
3 #include "agmattr.h"
4 #include "Snap.h"
5 #include "agm.h"
6 
7 
8 
9 void TCesna::RandomInit(const int InitComs) {
10  F.Gen(G->GetNodes());
11  SumFV.Gen(InitComs);
12  NumComs = InitComs;
13  for (int u = 0; u < F.Len(); u++) {
14  //assign to just one community
15  int Mem = G->GetNI(u).GetDeg();
16  if (Mem > 10) { Mem = 10; }
17  for (int c = 0; c < Mem; c++) {
18  int CID = Rnd.GetUniDevInt(InitComs);
19  AddCom(u, CID, Rnd.GetUniDev());
20  }
21  }
22  //assign a member to zero-member community (if any)
23  for (int c = 0; c < SumFV.Len(); c++) {
24  if (SumFV[c] == 0.0) {
25  int UID = Rnd.GetUniDevInt(G->GetNodes());
26  AddCom(UID, c, Rnd.GetUniDev());
27  }
28  }
29  InitW();
30 }
31 
32 void TCesna::NeighborComInit(const int InitComs) {
33  //initialize with best neighborhood communities (Gleich et.al. KDD'12)
34  TFltIntPrV NIdPhiV(F.Len(), 0);
35  TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV);
36  NeighborComInit(NIdPhiV, InitComs);
37 }
38 
39 void TCesna::NeighborComInit(TFltIntPrV& NIdPhiV, const int InitComs) {
40  //initialize with best neighborhood communities (Gleich et.al. KDD'12)
41  NIdPhiV.Sort(true);
42  F.Gen(G->GetNodes());
43  SumFV.Gen(InitComs);
44  NumComs = InitComs;
45  TIntSet InvalidNIDS(F.Len());
46  TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
47  //choose nodes with local minimum in conductance
48  int CurCID = 0;
49  for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
50  int UID = NIdPhiV[ui].Val2;
51  fflush(stdout);
52  if (InvalidNIDS.IsKey(UID)) { continue; }
53  ChosenNIDV.Add(UID); //FOR DEBUG
54  //add the node and its neighbors to the current community
55  AddCom(UID, CurCID, 1.0);
56  TUNGraph::TNodeI NI = G->GetNI(UID);
57  fflush(stdout);
58  for (int e = 0; e < NI.GetDeg(); e++) {
59  AddCom(NI.GetNbrNId(e), CurCID, 1.0);
60  }
61  //exclude its neighbors from the next considerations
62  for (int e = 0; e < NI.GetDeg(); e++) {
63  InvalidNIDS.AddKey(NI.GetNbrNId(e));
64  }
65  CurCID++;
66  fflush(stdout);
67  if (CurCID >= NumComs) { break; }
68  }
69  if (NumComs > CurCID) {
70  printf("%d communities needed to fill randomly\n", NumComs - CurCID);
71  }
72  //assign a member to zero-member community (if any)
73  for (int c = 0; c < SumFV.Len(); c++) {
74  if (SumFV[c] == 0.0) {
75  int ComSz = 10;
76  for (int u = 0; u < ComSz; u++) {
77  int UID = Rnd.GetUniDevInt(G->GetNodes());
78  AddCom(UID, c, Rnd.GetUniDev());
79  }
80  }
81  }
82  InitW();
83 }
84 
85 void TCesna::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
86  F.Gen(G->GetNodes());
87  SumFV.Gen(CmtyVV.Len());
88  NumComs = CmtyVV.Len();
89  InitW();
90  for (int c = 0; c < CmtyVV.Len(); c++) {
91  for (int u = 0; u < CmtyVV[c].Len(); u++) {
92  int UID = CmtyVV[c][u];
93  if (! NIDToIdx.IsKey(UID)) { continue; }
94  AddCom(NIDToIdx.GetKeyId(UID), c, 1.0);
95  }
96  }
97 }
98 
99 void TCesna::SetGraph(const PUNGraph& GraphPt, const THash<TInt, TIntV>& NIDAttrH) {
100  HOVIDSV.Gen(GraphPt->GetNodes());
101  HOKIDSV.Gen(GraphPt->GetNodes());
102  X.Gen(GraphPt->GetNodes());
103  TIntV NIDV;
104  GraphPt->GetNIdV(NIDV);
105  NIDToIdx.Gen(NIDV.Len());
106  NIDToIdx.AddKeyV(NIDV);
107  // check that nodes IDs are {0,1,..,Nodes-1}
108  printf("rearrage nodes\n");
109  G = TSnap::GetSubGraph(GraphPt, NIDV, true);
110  for (int nid = 0; nid < G->GetNodes(); nid++) {
111  IAssert(G->IsNode(nid));
112  }
114 
115  PNoCom = 1.0 / (double) G->GetNodes();
116  DoParallel = false;
117  if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow
118  NegWgt = 1.0;
119 
120  // add node attributes, find number of attributes
121  int NumAttr = 0;
122  for (int u = 0; u < NIDAttrH.Len(); u++) {
123  int UID = NIDAttrH.GetKey(u);
124  if (! NIDToIdx.IsKey(UID)) { continue; }
125  X[NIDToIdx.GetKeyId(UID)].Gen(NIDAttrH[u].Len());
126  for (int k = 0; k < NIDAttrH[u].Len(); k++) {
127  int KID = NIDAttrH[u][k];
128  IAssert (KID >= 0);
129  X[NIDToIdx.GetKeyId(UID)].AddKey(KID);
130  if (NumAttr < KID + 1) { NumAttr = KID + 1; }
131  }
132  }
133  Attrs = NumAttr;
134  InitW();
135 }
136 
137 double TCesna::Likelihood(const bool _DoParallel) {
138  TExeTm ExeTm;
139  double L = 0.0;
140  if (_DoParallel) {
141  #pragma omp parallel for
142  for (int u = 0; u < F.Len(); u++) {
143  double LU = LikelihoodForRow(u);
144  #pragma omp atomic
145  L += LU;
146  }
147  }
148  else {
149  for (int u = 0; u < F.Len(); u++) {
150  double LU = LikelihoodForRow(u);
151  L += LU;
152  }
153  }
154  return L;
155 }
156 
157 double TCesna::LikelihoodForRow(const int UID) {
158  return LikelihoodForRow(UID, F[UID]);
159 }
160 
161 double TCesna::LikelihoodForRow(const int UID, const TIntFltH& FU) {
162  double L = 0.0;
163  TFltV HOSumFV; //adjust for Fv of v hold out
164  if (HOVIDSV[UID].Len() > 0) {
165  HOSumFV.Gen(SumFV.Len());
166 
167  for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
168  for (int c = 0; c < SumFV.Len(); c++) {
169  HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
170  }
171  }
172  }
173 
174  TUNGraph::TNodeI NI = G->GetNI(UID);
175  for (int e = 0; e < NI.GetDeg(); e++) {
176  int v = NI.GetNbrNId(e);
177  if (v == UID) { continue; }
178  if (HOVIDSV[UID].IsKey(v)) { continue; }
179  L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
180  }
181  for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
182  double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
183  L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
184  }
185  //add regularization
186  if (RegCoef > 0.0) { //L1
187  L -= RegCoef * Sum(FU);
188  }
189  if (RegCoef < 0.0) { //L2
190  L += RegCoef * Norm2(FU);
191  }
192  L *= (1.0 - WeightAttr);
193  // add attribute part
194  for (int k = 0; k < Attrs; k++) {
195  if (HOKIDSV[UID].IsKey(k)) { continue; }
196  L += WeightAttr * LikelihoodAttrKForRow(UID, k, FU);
197  }
198  return L;
199 }
200 
201 
202 double TCesna::LikelihoodAttrKForRow(const int UID, const int K, const TIntFltH& FU, const TFltV& WK) {
203  double Prob = PredictAttrK(FU, WK);
204  double L = 0.0;
205  if (GetAttr(UID, K)) {
206  L = Prob == 0.0? -100.0: log(Prob);
207  } else {
208  L = Prob == 1.0? -100.0: log(1.0 - Prob);
209  }
210  return L;
211 }
212 
213 void TCesna::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
214  GradU.Gen(CIDSet.Len());
215  TFltV HOSumFV; //adjust for Fv of v hold out
216  if (HOVIDSV[UID].Len() > 0) {
217  HOSumFV.Gen(SumFV.Len());
218 
219  for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
220  for (int c = 0; c < SumFV.Len(); c++) {
221  HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
222  }
223  }
224  }
225  TUNGraph::TNodeI NI = G->GetNI(UID);
226  int Deg = NI.GetDeg();
227  TFltV PredV(Deg), GradV(CIDSet.Len());
228  TIntV CIDV(CIDSet.Len());
229  for (int e = 0; e < Deg; e++) {
230  if (NI.GetNbrNId(e) == UID) { continue; }
231  if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
232  PredV[e] = Prediction(UID, NI.GetNbrNId(e));
233  }
234 
235  for (int c = 0; c < CIDSet.Len(); c++) {
236  int CID = CIDSet.GetKey(c);
237  double Val = 0.0;
238  for (int e = 0; e < Deg; e++) {
239  int VID = NI.GetNbrNId(e);
240  if (VID == UID) { continue; }
241  if (HOVIDSV[UID].IsKey(VID)) { continue; }
242  Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
243  }
244  double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
245  Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
246  CIDV[c] = CID;
247  GradV[c] = Val;
248  }
249  //add regularization
250  if (RegCoef > 0.0) { //L1
251  for (int c = 0; c < GradV.Len(); c++) {
252  GradV[c] -= RegCoef;
253  }
254  }
255  if (RegCoef < 0.0) { //L2
256  for (int c = 0; c < GradV.Len(); c++) {
257  GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]);
258  }
259  }
260  for (int c = 0; c < GradV.Len(); c++) {
261  GradV[c] *= (1.0 - WeightAttr);
262  }
263  //add attribute part
264  TFltV AttrPredV(Attrs);
265  for (int k = 0; k < Attrs; k++) {
266  if (HOKIDSV[UID].IsKey(k)) { continue; }
267  AttrPredV[k] = PredictAttrK(F[UID], W[k]);
268  }
269  for (int c = 0; c < GradV.Len(); c++) {
270  for (int k = 0; k < Attrs; k++) {
271  if (HOKIDSV[UID].IsKey(k)) { continue; }
272  GradV[c] += WeightAttr * (GetAttr(UID, k) - AttrPredV[k]) * GetW(CIDV[c], k);
273  }
274  }
275 
276 
277  for (int c = 0; c < GradV.Len(); c++) {
278  if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
279  if (fabs(GradV[c]) < 0.0001) { continue; }
280  GradU.AddDat(CIDV[c], GradV[c]);
281  }
282  for (int c = 0; c < GradU.Len(); c++) {
283  if (GradU[c] >= 10) { GradU[c] = 10; }
284  if (GradU[c] <= -10) { GradU[c] = -10; }
285  IAssert(GradU[c] >= -10);
286  }
287 }
288 
290  TVec<TFltV> TmpV;
291  GetCmtyVV(CmtyVV, TmpV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
292 }
293 
294 
296 void TCesna::GetCmtyVV(TVec<TIntV>& CmtyVV, TVec<TFltV>& Wck, const double Thres, const int MinSz) {
297  CmtyVV.Gen(NumComs, 0);
298  Wck.Gen(NumComs, 0);
299  TIntFltH CIDSumFH(NumComs);
300  for (int c = 0; c < SumFV.Len(); c++) {
301  CIDSumFH.AddDat(c, SumFV[c]);
302  }
303  CIDSumFH.SortByDat(false);
304  for (int c = 0; c < NumComs; c++) {
305  int CID = CIDSumFH.GetKey(c);
306  TIntFltH NIDFucH(F.Len() / 10);
307  TIntV CmtyV;
308  IAssert(SumFV[CID] == CIDSumFH.GetDat(CID));
309  if (SumFV[CID] < Thres) { continue; }
310  for (int u = 0; u < F.Len(); u++) {
311  int NID = NIDToIdx[u];
312  if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); }
313  }
314  NIDFucH.SortByDat(false);
315  NIDFucH.GetKeyV(CmtyV);
316  if (CmtyV.Len() < MinSz) { continue; }
317  CmtyVV.Add(CmtyV);
318  TFltV WV(Attrs);
319  for (int k = 0; k < Attrs; k++) {
320  WV[k] = GetW(CID, k);
321  }
322  Wck.Add(WV);
323  }
324  if ( NumComs != CmtyVV.Len()) {
325  printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
326  }
327 }
328 
330  GetCmtyVVUnSorted(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
331 }
332 
333 void TCesna::GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) {
334  CmtyVV.Gen(NumComs, 0);
335  for (int c = 0; c < NumComs; c++) {
336  TIntV CmtyV;
337  for (int u = 0; u < G->GetNodes(); u++) {
338  if (GetCom(u, c) > Thres) { CmtyV.Add(NIDToIdx[u]); }
339  }
340  if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); }
341  }
342  if ( NumComs != CmtyVV.Len()) {
343  printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
344  }
345 }
346 
348 int TCesna::FindComs(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const bool UseBIC, const double HOFrac, const double StepAlpha, const double StepBeta) {
349  double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs);
350  TIntV ComsV;
351  ComsV.Add(MinComs);
352  while (ComsV.Len() < DivComs) {
353  int NewComs = int(ComsV.Last() * ComsGap);
354  if (NewComs == ComsV.Last().Val) { NewComs++; }
355  ComsV.Add(NewComs);
356  }
357  if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); }
358  return FindComs(ComsV, UseBIC, HOFrac, NumThreads, OutFNm, StepAlpha, StepBeta);
359 }
360 
361 int TCesna::FindComs(TIntV& ComsV, const bool UseBIC, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) {
362  if (ComsV.Len() == 0) {
363  int MaxComs = G->GetNodes() / 5;
364  ComsV.Add(2);
365  while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); }
366  }
367  int MaxIterCV = 3;
368 
369  TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV), HoldOutSetsAttr(MaxIterCV);
370  TFltIntPrV NIdPhiV;
371  TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV);
372  if (! UseBIC) { //if edges are many enough, use CV
373  //printf("generating hold out set\n");
374  TIntV NIdV1, NIdV2;
375  G->GetNIdV(NIdV1);
376  G->GetNIdV(NIdV2);
377  for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
378  // generate holdout sets
379  TCesnaUtil::GenHoldOutPairs(G, HoldOutSets[IterCV], HOFrac, Rnd);
380  GenHoldOutAttr(HOFrac, HoldOutSetsAttr[IterCV]);
381  }
382  //printf("hold out set generated\n");
383  }
384 
385  TFltV HOLV(ComsV.Len());
386  TIntFltPrV ComsLV;
387  for (int c = 0; c < ComsV.Len(); c++) {
388  const int Coms = ComsV[c];
389  //printf("Try number of Coms:%d\n", Coms);
390 
391  if (! UseBIC) { //if edges are many enough, use CV
392  for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
393  HOVIDSV = HoldOutSets[IterCV];
394  HOKIDSV = HoldOutSetsAttr[IterCV];
395  NeighborComInit(NIdPhiV, Coms);
396  //printf("Initialized\n");
397 
398  if (NumThreads == 1) {
399  //printf("MLE without parallelization begins\n");
400  MLEGradAscent(0.01, 100 * G->GetNodes(), "", StepAlpha, StepBeta);
401  //printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut());
402  } else {
403  //printf("MLE with parallelization begins\n");
404  MLEGradAscentParallel(0.01, 100, NumThreads, "", StepAlpha, StepBeta);
405  }
406  double HOL = LikelihoodHoldOut();
407  HOL = HOL < 0? HOL: TFlt::Mn;
408  HOLV[c] += HOL;
409  }
410  }
411  else {
412  HOVIDSV.Gen(G->GetNodes());
413  HOKIDSV.Gen(G->GetNodes());
414  if (NumThreads == 1) {
415  MLEGradAscent(0.005, 100 * G->GetNodes(), "", StepAlpha, StepBeta);
416  printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut());
417  } else {
418  MLEGradAscentParallel(0.005, 100, NumThreads, "", StepAlpha, StepBeta);
419  }
420  //int NumParams = G->GetNodes() * Coms + Coms * Attrs;
421  double NumParams = (1.0 - WeightAttr) * Coms + WeightAttr * Coms * Attrs;
422  double Observations = (1.0 - WeightAttr) * G->GetNodes() * (G->GetNodes() - 1) / 2 + WeightAttr * G->GetNodes() * Attrs;
423  double BIC = 2 * Likelihood() - NumParams * log (Observations);
424  HOLV[c] = BIC;
425  }
426  }
427  int EstComs = 2;
428  double MaxL = TFlt::Mn;
429  if (UseBIC) {
430  printf("Number of communities vs likelihood (criterion: BIC)\n");
431  } else {
432  printf("Number of communities vs likelihood (criterion: Cross validation)\n");
433  }
434  for (int c = 0; c < ComsV.Len(); c++) {
435  ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val));
436  printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val);
437  if (MaxL < HOLV[c]) {
438  MaxL = HOLV[c];
439  EstComs = ComsV[c];
440  }
441  }
442  printf("\n");
443  RandomInit(EstComs);
444  HOVIDSV.Gen(G->GetNodes());
445  HOKIDSV.Gen(G->GetNodes());
446  if (! PlotLFNm.Empty()) {
447  TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood");
448  }
449  return EstComs;
450 }
451 
453  double L = 0.0;
454  for (int u = 0; u < HOVIDSV.Len(); u++) {
455  for (int e = 0; e < HOVIDSV[u].Len(); e++) {
456  int VID = HOVIDSV[u][e];
457  if (VID == u) { continue; }
458  double Pred = Prediction(u, VID);
459  if (G->IsEdge(u, VID)) {
460  L += log(1.0 - Pred);
461  }
462  else {
463  L += NegWgt * log(Pred);
464  }
465  //printf("NODE %d, %d: Dot Prod: %f, Prediction: %f L:%f\n", u, VID, DotProduct(F[u], F[VID]), Pred, L);
466  }
467  }
468  L *= (1.0 - WeightAttr);
469  //printf("likelihood for hold out network:%f\n", L);
470  for (int u = 0; u < HOKIDSV.Len(); u++) {
471  for (int e = 0; e < HOKIDSV[u].Len(); e++) {
472  IAssert(HOKIDSV[u][e] < Attrs);
473  L += WeightAttr * LikelihoodAttrKForRow(u, HOKIDSV[u][e]);
474  //printf("asbefsafd ATTRIBUTE %d, NODE %d:%f\n", u, HOKIDSV[u][e].Val, LikelihoodAttrKForRow(u, HOKIDSV[u][e]));
475  }
476  }
477  return L;
478 }
479 
480 double TCesna::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) {
481  double StepSize = 1.0;
482  double InitLikelihood = LikelihoodForRow(UID);
483  TIntFltH NewVarV(DeltaV.Len());
484  for(int iter = 0; iter < MaxIter; iter++) {
485  for (int i = 0; i < DeltaV.Len(); i++){
486  int CID = DeltaV.GetKey(i);
487  double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID);
488  if (NewVal < MinVal) { NewVal = MinVal; }
489  if (NewVal > MaxVal) { NewVal = MaxVal; }
490  NewVarV.AddDat(CID, NewVal);
491  }
492  if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) {
493  StepSize *= Beta;
494  } else {
495  break;
496  }
497  if (iter == MaxIter - 1) {
498  StepSize = 0.0;
499  break;
500  }
501  }
502  return StepSize;
503 }
504 
505 int TCesna::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
506  time_t InitTime = time(NULL);
507  TExeTm ExeTm, CheckTm;
508  int iter = 0, PrevIter = 0;
509  TIntFltPrV IterLV;
510  TUNGraph::TNodeI UI;
511  double PrevL = TFlt::Mn, CurL = 0.0;
512  TIntV NIdxV(F.Len(), 0);
513  for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
514  TIntFltH GradV;
515  //consider all C
516  TIntSet CIDSet(NumComs);
517  for (int c = 0; c < NumComs; c++) { CIDSet.AddKey(c); }
518  while(iter < MaxIter) {
519  NIdxV.Shuffle(Rnd);
520  for (int ui = 0; ui < F.Len(); ui++, iter++) {
521  int u = NIdxV[ui]; //
522  /*
523  //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
524  UI = G->GetNI(u);
525  TIntSet CIDSet(20 * UI.GetDeg());
526  for (int e = 0; e < UI.GetDeg(); e++) {
527  if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
528  TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
529  for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
530  CIDSet.AddKey(CI.GetKey());
531  }
532  }
533  for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
534  if (! CIDSet.IsKey(CI.GetKey())) {
535  DelCom(u, CI.GetKey());
536  }
537  }
538  if (CIDSet.Empty()) { continue; }
539  */
540 
541  GradientForRow(u, GradV, CIDSet);
542  if (Norm2(GradV) < 1e-4) { continue; }
543  double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
544  if (LearnRate == 0.0) { continue; }
545  for (int ci = 0; ci < GradV.Len(); ci++) {
546  int CID = GradV.GetKey(ci);
547  double Change = LearnRate * GradV.GetDat(CID);
548  double NewFuc = GetCom(u, CID) + Change;
549  if (NewFuc <= 0.0) {
550  DelCom(u, CID);
551  } else {
552  AddCom(u, CID, NewFuc);
553  }
554  }
555  if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) {
556  IterLV.Add(TIntFltPr(iter, Likelihood(false)));
557  }
558  }
559  // fit W (logistic regression)
560  for (int k = 0; k < Attrs; k++) {
561  TFltV GradWV(NumComs);
562  GradientForWK(GradWV, k);
563  if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; }
564  double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta);
565  if (LearnRate == 0.0) { continue; }
566  for (int c = 0; c < GradWV.Len(); c++){
567  W[k][c] += LearnRate * GradWV[c];
568  if (W[k][c] < MinValW) { W[k][c] = MinValW; }
569  if (W[k][c] > MaxValW) { W[k][c] = MaxValW; }
570  }
571  }
572  printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime);
573  fflush(stdout);
574  if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
575  PrevIter = iter;
576  CurL = Likelihood();
577  if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
578  printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL);
579  }
580  fflush(stdout);
581  if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
582  else { PrevL = CurL; }
583  }
584 
585  }
586  printf("\n");
587  printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
588  if (! PlotNm.Empty()) {
589  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
590  }
591  return iter;
592 }
593 
594 int TCesna::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
595  //parallel
596  time_t InitTime = time(NULL);
597  uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs();
598  TExeTm ExeTm, CheckTm;
599  double PrevL = Likelihood(true);
600  TIntFltPrV IterLV;
601  int PrevIter = 0;
602  int iter = 0;
603  TIntV NIdxV(F.Len(), 0);
604  for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
605  TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization
606  NIDOPTV.PutAll(0);
607  TVec<TIntFltH> NewF(ChunkNum * ChunkSize);
608  TIntV NewNIDV(ChunkNum * ChunkSize);
609  for (iter = 0; iter < MaxIter; iter++) {
610  NIdxV.Clr(false);
611  for (int i = 0; i < F.Len(); i++) {
612  if (NIDOPTV[i] == 0) { NIdxV.Add(i); }
613  }
614  IAssert (NIdxV.Len() <= F.Len());
615  NIdxV.Shuffle(Rnd);
616  // compute gradient for chunk of nodes
617 #pragma omp parallel for schedule(static, 1)
618  for (int TIdx = 0; TIdx < ChunkNum; TIdx++) {
619  TIntFltH GradV;
620  for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) {
621  NewNIDV[ui] = -1;
622  if (ui >= NIdxV.Len()) { continue; }
623  int u = NIdxV[ui]; //
624  //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
625  TUNGraph::TNodeI UI = G->GetNI(u);
626  TIntSet CIDSet(5 * UI.GetDeg());
627  TIntFltH CurFU = F[u];
628  for (int e = 0; e < UI.GetDeg(); e++) {
629  if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
630  TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
631  for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
632  CIDSet.AddKey(CI.GetKey());
633  }
634  }
635  if (CIDSet.Empty()) {
636  CurFU.Clr();
637  }
638  else {
639  for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors
640  if (! CIDSet.IsKey(CI.GetKey())) {
641  CurFU.DelIfKey(CI.GetKey());
642  }
643  }
644  GradientForRow(u, GradV, CIDSet);
645  if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; }
646  double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
647  if (LearnRate == 0.0) { NewNIDV[ui] = -2; continue; }
648  for (int ci = 0; ci < GradV.Len(); ci++) {
649  int CID = GradV.GetKey(ci);
650  double Change = LearnRate * GradV.GetDat(CID);
651  double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change;
652  if (NewFuc <= 0.0) {
653  CurFU.DelIfKey(CID);
654  } else {
655  CurFU.AddDat(CID) = NewFuc;
656  }
657  }
658  CurFU.Defrag();
659  }
660  //store changes
661  NewF[ui] = CurFU;
662  NewNIDV[ui] = u;
663  }
664  }
665  int NumNoChangeGrad = 0;
666  int NumNoChangeStepSize = 0;
667  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
668  int NewNID = NewNIDV[ui];
669  if (NewNID == -1) { NumNoChangeGrad++; continue; }
670  if (NewNID == -2) { NumNoChangeStepSize++; continue; }
671  for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
672  SumFV[CI.GetKey()] -= CI.GetDat();
673  }
674  }
675 #pragma omp parallel for
676  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
677  int NewNID = NewNIDV[ui];
678  if (NewNID < 0) { continue; }
679  F[NewNID] = NewF[ui];
680  }
681  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
682  int NewNID = NewNIDV[ui];
683  if (NewNID < 0) { continue; }
684  for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
685  SumFV[CI.GetKey()] += CI.GetDat();
686  }
687  }
688  // update the nodes who are optimal
689  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
690  int NewNID = NewNIDV[ui];
691  if (NewNID < 0) { continue; }
692  TUNGraph::TNodeI UI = G->GetNI(NewNID);
693  NIDOPTV[NewNID] = 0;
694  for (int e = 0; e < UI.GetDeg(); e++) {
695  NIDOPTV[UI.GetNbrNId(e)] = 0;
696  }
697  }
698  int OPTCnt = 0;
699  for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } }
700  // fit W (logistic regression)
701  if (! PlotNm.Empty()) {
702  printf("\r%d iterations [%s] %s secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr());
703  if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); }
704  fflush(stdout);
705  }
706  if (iter == 0 || (iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) {
707  #pragma omp parallel for
708  for (int k = 0; k < Attrs; k++) {
709  TFltV GradWV(NumComs);
710  GradientForWK(GradWV, k);
711  if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; }
712  double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta);
713  if (LearnRate == 0.0) { continue; }
714  for (int c = 0; c < GradWV.Len(); c++){
715  W[k][c] += LearnRate * GradWV[c];
716  if (W[k][c] < MinValW) { W[k][c] = MinValW; }
717  if (W[k][c] > MaxValW) { W[k][c] = MaxValW; }
718  }
719  }
720  PrevIter = iter;
721  double CurL = Likelihood(true);
722  IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL));
723  printf("\r%d iterations, Likelihood: %f, Diff: %f [%lu secs]", iter, CurL, CurL - PrevL, time(NULL) - InitTime);
724  fflush(stdout);
725  if (CurL - PrevL <= Thres * fabs(PrevL)) {
726  break;
727  }
728  else {
729  PrevL = CurL;
730  }
731  }
732  }
733  if (! PlotNm.Empty()) {
734  printf("\nMLE completed with %d iterations(%s secs)\n", iter, TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr());
735  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
736  } else {
737  printf("\rMLE completed with %d iterations(%lu secs)", iter, time(NULL) - InitTime);
738  fflush(stdout);
739  }
740  return iter;
741 }
int MLEGradAscentParallel(const double &Thres, const int &MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:594
#define IAssert(Cond)
Definition: bd.h:262
void GenHoldOutAttr(const double HOFrac, TVec< TIntSet > &HOSetV)
Definition: agmattr.h:397
PUNGraph G
Definition: agmattr.h:248
TFltV SumFV
Definition: agmattr.h:256
TFlt MaxVal
Definition: agmattr.h:262
TBool DoParallel
Definition: agmattr.h:269
Definition: tm.h:355
double Sum(const TIntFltH &UV)
Definition: agmattr.h:609
double GetCom(const int &NID, const int &CID)
Definition: agmattr.h:527
void AddKeyV(const TVec< TKey > &KeyV)
Definition: shash.h:1284
TFlt NegWgt
Definition: agmattr.h:265
TVec< TFltV > W
Definition: agmattr.h:251
int Val
Definition: dt.h:1139
int MLEGradAscent(const double &Thres, const int &MaxIter, const TStr PlotNm, const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:505
double Prediction(const TIntFltH &FU, const TIntFltH &FV)
Definition: agmattr.h:584
int GetKeyId(const TKey &Key) const
Definition: shash.h:1328
double DotProduct(const TIntFltH &UV, const TIntFltH &VV)
Definition: agmattr.h:564
void GetNIdV(TIntV &NIdV) const
Gets a vector IDs of all nodes in the graph.
Definition: graph.cpp:153
TFlt PNoCom
Definition: agmattr.h:268
TIter BegI() const
Definition: hash.h:213
TFlt MinVal
Definition: agmattr.h:261
int GetEdges() const
Returns the number of edges in the graph.
Definition: graph.cpp:82
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:68
double LikelihoodAttr()
Definition: agmattr.h:371
void GetKeyV(TVec< TKey > &KeyV) const
Definition: shash.h:1347
void Gen(const int &ExpectVals)
Definition: shash.h:1115
double GetAttr(const int &NID, const int &K)
Definition: agmattr.h:534
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
TIntSet NIDToIdx
Definition: agmattr.h:254
TIter EndI() const
Definition: hash.h:218
void GetW(TVec< TFltV > &_W)
Definition: agmattr.h:346
double GetStepSizeByLineSearch(const int UID, const TIntFltH &DeltaV, const TIntFltH &GradV, const double &Alpha, const double &Beta, const int MaxIter=10)
Definition: agmattr.cpp:480
double PredictAttrK(const TIntFltH &FU, const TFltV &WK)
Definition: agmattr.h:589
static const double Mx
Definition: dt.h:1391
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
double Likelihood(const bool DoParallel=false)
Definition: agmattr.cpp:137
const TKey & GetKey(const int &KeyId) const
Definition: shash.h:1141
int GetNodes() const
Returns the number of nodes in the graph.
Definition: graph.h:192
double LikelihoodAttrKForRow(const int UID, const int K)
Definition: agmattr.h:356
TVec< TIntFltH > F
Definition: agmattr.h:250
TFlt MaxValW
Definition: agmattr.h:264
TPair< TInt, TFlt > TIntFltPr
Definition: ds.h:87
int GetDeg() const
Returns degree of the current node.
Definition: graph.h:90
TFlt MinValW
Definition: agmattr.h:263
const char * GetTmStr() const
Definition: tm.h:370
TInt Attrs
Definition: agmattr.h:252
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1318
void Gen(const int &ExpectVals)
Definition: hash.h:222
static double Norm2(const TFltV &x)
Definition: linalg.cpp:320
void RandomInit(const int InitComs)
Definition: agmattr.cpp:9
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1229
double LikelihoodHoldOut()
Definition: agmattr.cpp:452
unsigned long long uint64
Definition: bd.h:38
const TVal & GetDat(const TVal &Val) const
Returns reference to the first occurrence of element Val.
Definition: ds.h:838
PUNGraph GetSubGraph(const PUNGraph &Graph, const TIntV &NIdV, const bool &RenumberNodes)
Returns an induced subgraph of an undirected graph Graph with NIdV nodes with an optional node renumb...
Definition: subgraph.cpp:7
double Norm2(const TIntFltH &UV)
Definition: agmattr.h:616
uint GetAbsSecs() const
Definition: tm.h:150
TNodeI GetNI(const int &NId) const
Returns an iterator referring to the node of ID NId in the graph.
Definition: graph.h:241
TVec< TIntSet > X
Definition: agmattr.h:249
TFlt RegCoef
Definition: agmattr.h:255
TVec< TIntSet > HOKIDSV
Definition: agmattr.h:259
TVec< TIntSet > HOVIDSV
Definition: agmattr.h:258
int AddKey(const TKey &Key)
Definition: shash.h:1254
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
void AddCom(const int &NID, const int &CID, const double &Val)
Definition: agmattr.h:541
void GetCmtyVVUnSorted(TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:329
void InitW()
Definition: agmattr.h:331
static void GenHoldOutPairs(const PGraph &G, TVec< TIntSet > &HoldOutSet, double HOFrac, TRnd &Rnd)
Definition: agmattr.h:38
TRnd Rnd
Definition: agmattr.h:253
void GetCmtyVV(TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:289
void SetCmtyVV(const TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:85
void NeighborComInit(const int InitComs)
Definition: agmattr.cpp:32
int Len() const
Definition: shash.h:1121
TStr GetStr() const
Definition: dt.h:1363
double GetStepSizeByLineSearchForWK(const int K, const TFltV &DeltaV, const TFltV &GradV, const double &Alpha, const double &Beta, const int MaxIter=10)
Definition: agmattr.h:486
Definition: dt.h:412
bool Empty() const
Definition: dt.h:491
double LikelihoodForRow(const int UID)
Definition: agmattr.cpp:157
TInt NumComs
Definition: agmattr.h:257
void GradientForRow(const int UID, TIntFltH &GradU, const TIntSet &CIDSet)
Definition: agmattr.cpp:213
double GetUniDev()
Definition: dt.h:30
static TSecTm GetCurTm()
Definition: tm.cpp:697
void SetGraph(const PUNGraph &GraphPt, const THash< TInt, TIntV > &NIDAttrH)
Definition: agmattr.cpp:99
int GetNbrNId(const int &NodeN) const
Returns ID of NodeN-th neighboring node.
Definition: graph.h:111
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
void GradientForWK(TFltV &GradV, const int K)
Definition: agmattr.h:421
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:523
static double Log(const double &Val)
Definition: xmath.h:14
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int FindComs(TIntV &ComsV, const bool UseBIC=false, const double HOFrac=0.2, const int NumThreads=20, const TStr PlotLFNm=TStr(), const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:361
char * CStr()
Definition: dt.h:479
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelSelfEdges(const PGraph &Graph)
Removes all the self-edges from the graph.
Definition: alg.h:419
int Len() const
Definition: hash.h:228
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
void DelCom(const int &NID, const int &CID)
Definition: agmattr.h:549
TFlt WeightAttr
Definition: agmattr.h:267
THKeyDat * EndI
Definition: hash.h:54
static void PlotValV(const TVec< TVal1 > &ValV, const TStr &OutFNmPref, const TStr &Desc="", const TStr &XLabel="", const TStr &YLabel="", const TGpScaleTy &ScaleTy=gpsAuto, const bool &PowerFit=false, const TGpSeriesTy &SeriesTy=gpwLinesPoints)
Definition: gnuplot.h:398
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:252
static const double Mn
Definition: dt.h:1390
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:430
void SortByDat(const bool &Asc=true)
Definition: hash.h:292