SNAP Library 4.0, User Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
agmattr.cpp
Go to the documentation of this file.
1 
2 #include "stdafx.h"
3 #include "agmattr.h"
4 #include "Snap.h"
5 #include "agm.h"
6 
7 
8 
9 void TCesna::RandomInit(const int InitComs) {
10  F.Gen(G->GetNodes());
11  SumFV.Gen(InitComs);
12  NumComs = InitComs;
13  for (int u = 0; u < F.Len(); u++) {
14  //assign to just one community
15  int Mem = G->GetNI(u).GetDeg();
16  if (Mem > 10) { Mem = 10; }
17  for (int c = 0; c < Mem; c++) {
18  int CID = Rnd.GetUniDevInt(InitComs);
19  AddCom(u, CID, Rnd.GetUniDev());
20  }
21  }
22  //assign a member to zero-member community (if any)
23  for (int c = 0; c < SumFV.Len(); c++) {
24  if (SumFV[c] == 0.0) {
25  int UID = Rnd.GetUniDevInt(G->GetNodes());
26  AddCom(UID, c, Rnd.GetUniDev());
27  }
28  }
29  InitW();
30 }
31 
32 void TCesna::NeighborComInit(const int InitComs) {
33  //initialize with best neighborhood communities (Gleich et.al. KDD'12)
34  TFltIntPrV NIdPhiV(F.Len(), 0);
35  TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV);
36  NeighborComInit(NIdPhiV, InitComs);
37 }
38 
39 void TCesna::NeighborComInit(TFltIntPrV& NIdPhiV, const int InitComs) {
40  //initialize with best neighborhood communities (Gleich et.al. KDD'12)
41  NIdPhiV.Sort(true);
42  F.Gen(G->GetNodes());
43  SumFV.Gen(InitComs);
44  NumComs = InitComs;
45  TIntSet InvalidNIDS(F.Len());
46  TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
47  //choose nodes with local minimum in conductance
48  int CurCID = 0;
49  for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
50  int UID = NIdPhiV[ui].Val2;
51  fflush(stdout);
52  if (InvalidNIDS.IsKey(UID)) { continue; }
53  ChosenNIDV.Add(UID); //FOR DEBUG
54  //add the node and its neighbors to the current community
55  AddCom(UID, CurCID, 1.0);
56  TUNGraph::TNodeI NI = G->GetNI(UID);
57  fflush(stdout);
58  for (int e = 0; e < NI.GetDeg(); e++) {
59  AddCom(NI.GetNbrNId(e), CurCID, 1.0);
60  }
61  //exclude its neighbors from the next considerations
62  for (int e = 0; e < NI.GetDeg(); e++) {
63  InvalidNIDS.AddKey(NI.GetNbrNId(e));
64  }
65  CurCID++;
66  fflush(stdout);
67  if (CurCID >= NumComs) { break; }
68  }
69  if (NumComs > CurCID) {
70  printf("%d communities needed to fill randomly\n", NumComs - CurCID);
71  }
72  //assign a member to zero-member community (if any)
73  for (int c = 0; c < SumFV.Len(); c++) {
74  if (SumFV[c] == 0.0) {
75  int ComSz = 10;
76  for (int u = 0; u < ComSz; u++) {
77  int UID = Rnd.GetUniDevInt(G->GetNodes());
78  AddCom(UID, c, Rnd.GetUniDev());
79  }
80  }
81  }
82  InitW();
83 }
84 
85 void TCesna::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
86  F.Gen(G->GetNodes());
87  SumFV.Gen(CmtyVV.Len());
88  NumComs = CmtyVV.Len();
89  InitW();
90  for (int c = 0; c < CmtyVV.Len(); c++) {
91  for (int u = 0; u < CmtyVV[c].Len(); u++) {
92  int UID = CmtyVV[c][u];
93  if (! NIDToIdx.IsKey(UID)) { continue; }
94  AddCom(NIDToIdx.GetKeyId(UID), c, 1.0);
95  }
96  }
97 }
98 
99 void TCesna::SetGraph(const PUNGraph& GraphPt, const THash<TInt, TIntV>& NIDAttrH) {
100  HOVIDSV.Gen(GraphPt->GetNodes());
101  HOKIDSV.Gen(GraphPt->GetNodes());
102  X.Gen(GraphPt->GetNodes());
103  TIntV NIDV;
104  GraphPt->GetNIdV(NIDV);
105  NIDToIdx.Gen(NIDV.Len());
106  NIDToIdx.AddKeyV(NIDV);
107  // check that nodes IDs are {0,1,..,Nodes-1}
108  printf("rearrage nodes\n");
109  G = TSnap::GetSubGraph(GraphPt, NIDV, true);
110  for (int nid = 0; nid < G->GetNodes(); nid++) {
111  IAssert(G->IsNode(nid));
112  }
114 
115  PNoCom = 1.0 / (double) G->GetNodes();
116  DoParallel = false;
117  if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow
118  NegWgt = 1.0;
119 
120  // add node attributes, find number of attributes
121  int NumAttr = 0;
122  for (int u = 0; u < NIDAttrH.Len(); u++) {
123  int UID = NIDAttrH.GetKey(u);
124  if (! NIDToIdx.IsKey(UID)) { continue; }
125  X[NIDToIdx.GetKeyId(UID)].Gen(NIDAttrH[u].Len());
126  for (int k = 0; k < NIDAttrH[u].Len(); k++) {
127  int KID = NIDAttrH[u][k];
128  IAssert (KID >= 0);
129  X[NIDToIdx.GetKeyId(UID)].AddKey(KID);
130  if (NumAttr < KID + 1) { NumAttr = KID + 1; }
131  }
132  }
133  Attrs = NumAttr;
134  InitW();
135 }
136 
137 double TCesna::Likelihood(const bool _DoParallel) {
138  TExeTm ExeTm;
139  double L = 0.0;
140  if (_DoParallel) {
141  #pragma omp parallel for
142  for (int u = 0; u < F.Len(); u++) {
143  double LU = LikelihoodForRow(u);
144  #pragma omp atomic
145  L += LU;
146  }
147  }
148  else {
149  for (int u = 0; u < F.Len(); u++) {
150  double LU = LikelihoodForRow(u);
151  L += LU;
152  }
153  }
154  return L;
155 }
156 
157 double TCesna::LikelihoodForRow(const int UID) {
158  return LikelihoodForRow(UID, F[UID]);
159 }
160 
161 double TCesna::LikelihoodForRow(const int UID, const TIntFltH& FU) {
162  double L = 0.0;
163  TFltV HOSumFV; //adjust for Fv of v hold out
164  if (HOVIDSV[UID].Len() > 0) {
165  HOSumFV.Gen(SumFV.Len());
166 
167  for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
168  for (int c = 0; c < SumFV.Len(); c++) {
169  HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
170  }
171  }
172  }
173 
174  TUNGraph::TNodeI NI = G->GetNI(UID);
175  for (int e = 0; e < NI.GetDeg(); e++) {
176  int v = NI.GetNbrNId(e);
177  if (v == UID) { continue; }
178  if (HOVIDSV[UID].IsKey(v)) { continue; }
179  L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
180  }
181  for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
182  double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
183  L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
184  }
185  //add regularization
186  if (RegCoef > 0.0) { //L1
187  L -= RegCoef * Sum(FU);
188  }
189  if (RegCoef < 0.0) { //L2
190  L += RegCoef * Norm2(FU);
191  }
192  L *= (1.0 - WeightAttr);
193  // add attribute part
194  for (int k = 0; k < Attrs; k++) {
195  if (HOKIDSV[UID].IsKey(k)) { continue; }
196  L += WeightAttr * LikelihoodAttrKForRow(UID, k, FU);
197  }
198  return L;
199 }
200 
201 
202 double TCesna::LikelihoodAttrKForRow(const int UID, const int K, const TIntFltH& FU, const TFltV& WK) {
203  double Prob = PredictAttrK(FU, WK);
204  double L = 0.0;
205  if (GetAttr(UID, K)) {
206  L = Prob == 0.0? -100.0: log(Prob);
207  } else {
208  L = Prob == 1.0? -100.0: log(1.0 - Prob);
209  }
210  return L;
211 }
212 
213 void TCesna::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
214  GradU.Gen(CIDSet.Len());
215  TFltV HOSumFV; //adjust for Fv of v hold out
216  if (HOVIDSV[UID].Len() > 0) {
217  HOSumFV.Gen(SumFV.Len());
218 
219  for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
220  for (int c = 0; c < SumFV.Len(); c++) {
221  HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
222  }
223  }
224  }
225  TUNGraph::TNodeI NI = G->GetNI(UID);
226  int Deg = NI.GetDeg();
227  TFltV PredV(Deg), GradV(CIDSet.Len());
228  TIntV CIDV(CIDSet.Len());
229  for (int e = 0; e < Deg; e++) {
230  if (NI.GetNbrNId(e) == UID) { continue; }
231  if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
232  PredV[e] = Prediction(UID, NI.GetNbrNId(e));
233  }
234 
235  for (int c = 0; c < CIDSet.Len(); c++) {
236  int CID = CIDSet.GetKey(c);
237  double Val = 0.0;
238  for (int e = 0; e < Deg; e++) {
239  int VID = NI.GetNbrNId(e);
240  if (VID == UID) { continue; }
241  if (HOVIDSV[UID].IsKey(VID)) { continue; }
242  Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
243  }
244  double HOSum = HOVIDSV[UID].Len() > 0? HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
245  Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
246  CIDV[c] = CID;
247  GradV[c] = Val;
248  }
249  //add regularization
250  if (RegCoef > 0.0) { //L1
251  for (int c = 0; c < GradV.Len(); c++) {
252  GradV[c] -= RegCoef;
253  }
254  }
255  if (RegCoef < 0.0) { //L2
256  for (int c = 0; c < GradV.Len(); c++) {
257  GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]);
258  }
259  }
260  for (int c = 0; c < GradV.Len(); c++) {
261  GradV[c] *= (1.0 - WeightAttr);
262  }
263  //add attribute part
264  TFltV AttrPredV(Attrs);
265  for (int k = 0; k < Attrs; k++) {
266  if (HOKIDSV[UID].IsKey(k)) { continue; }
267  AttrPredV[k] = PredictAttrK(F[UID], W[k]);
268  }
269  for (int c = 0; c < GradV.Len(); c++) {
270  for (int k = 0; k < Attrs; k++) {
271  if (HOKIDSV[UID].IsKey(k)) { continue; }
272  GradV[c] += WeightAttr * (GetAttr(UID, k) - AttrPredV[k]) * GetW(CIDV[c], k);
273  }
274  }
275 
276 
277  for (int c = 0; c < GradV.Len(); c++) {
278  if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
279  if (fabs(GradV[c]) < 0.0001) { continue; }
280  GradU.AddDat(CIDV[c], GradV[c]);
281  }
282  for (int c = 0; c < GradU.Len(); c++) {
283  if (GradU[c] >= 10) { GradU[c] = 10; }
284  if (GradU[c] <= -10) { GradU[c] = -10; }
285  IAssert(GradU[c] >= -10);
286  }
287 }
288 
290  TVec<TFltV> TmpV;
291  GetCmtyVV(CmtyVV, TmpV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
292 }
293 
294 
296 void TCesna::GetCmtyVV(TVec<TIntV>& CmtyVV, TVec<TFltV>& Wck, const double Thres, const int MinSz) {
297  CmtyVV.Gen(NumComs, 0);
298  Wck.Gen(NumComs, 0);
299  TIntFltH CIDSumFH(NumComs);
300  for (int c = 0; c < SumFV.Len(); c++) {
301  CIDSumFH.AddDat(c, SumFV[c]);
302  }
303  CIDSumFH.SortByDat(false);
304  for (int c = 0; c < NumComs; c++) {
305  int CID = CIDSumFH.GetKey(c);
306  TIntFltH NIDFucH(F.Len() / 10);
307  TIntV CmtyV;
308  IAssert(SumFV[CID] == CIDSumFH.GetDat(CID));
309  if (SumFV[CID] < Thres) { continue; }
310  for (int u = 0; u < F.Len(); u++) {
311  int NID = NIDToIdx[u];
312  if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); }
313  }
314  NIDFucH.SortByDat(false);
315  NIDFucH.GetKeyV(CmtyV);
316  if (CmtyV.Len() < MinSz) { continue; }
317  CmtyVV.Add(CmtyV);
318  TFltV WV(Attrs);
319  for (int k = 0; k < Attrs; k++) {
320  WV[k] = GetW(CID, k);
321  }
322  Wck.Add(WV);
323  }
324  if ( NumComs != CmtyVV.Len()) {
325  printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
326  }
327 }
328 
330  GetCmtyVVUnSorted(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
331 }
332 
333 void TCesna::GetCmtyVVUnSorted(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) {
334  CmtyVV.Gen(NumComs, 0);
335  for (int c = 0; c < NumComs; c++) {
336  TIntV CmtyV;
337  for (int u = 0; u < G->GetNodes(); u++) {
338  if (GetCom(u, c) > Thres) { CmtyV.Add(NIDToIdx[u]); }
339  }
340  if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); }
341  }
342  if ( NumComs != CmtyVV.Len()) {
343  printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
344  }
345 }
346 
348 int TCesna::FindComs(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const bool UseBIC, const double HOFrac, const double StepAlpha, const double StepBeta) {
349  double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs);
350  TIntV ComsV;
351  ComsV.Add(MinComs);
352  while (ComsV.Len() < DivComs) {
353  int NewComs = int(ComsV.Last() * ComsGap);
354  if (NewComs == ComsV.Last().Val) { NewComs++; }
355  ComsV.Add(NewComs);
356  }
357  if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); }
358  return FindComs(ComsV, UseBIC, HOFrac, NumThreads, OutFNm, StepAlpha, StepBeta);
359 }
360 
361 int TCesna::FindComs(TIntV& ComsV, const bool UseBIC, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) {
362  if (ComsV.Len() == 0) {
363  int MaxComs = G->GetNodes() / 5;
364  ComsV.Add(2);
365  while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); }
366  }
367  int MaxIterCV = 3;
368 
369  TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV), HoldOutSetsAttr(MaxIterCV);
370  TFltIntPrV NIdPhiV;
371  TCesnaUtil::GetNIdPhiV<PUNGraph>(G, NIdPhiV);
372  if (! UseBIC) { //if edges are many enough, use CV
373  //printf("generating hold out set\n");
374  TIntV NIdV1, NIdV2;
375  G->GetNIdV(NIdV1);
376  G->GetNIdV(NIdV2);
377  for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
378  // generate holdout sets
379  TCesnaUtil::GenHoldOutPairs(G, HoldOutSets[IterCV], HOFrac, Rnd);
380  GenHoldOutAttr(HOFrac, HoldOutSetsAttr[IterCV]);
381  }
382  //printf("hold out set generated\n");
383  }
384 
385  TFltV HOLV(ComsV.Len());
386  TIntFltPrV ComsLV;
387  for (int c = 0; c < ComsV.Len(); c++) {
388  const int Coms = ComsV[c];
389  //printf("Try number of Coms:%d\n", Coms);
390 
391  if (! UseBIC) { //if edges are many enough, use CV
392  for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
393  HOVIDSV = HoldOutSets[IterCV];
394  HOKIDSV = HoldOutSetsAttr[IterCV];
395  NeighborComInit(NIdPhiV, Coms);
396  //printf("Initialized\n");
397 
398  if (NumThreads == 1) {
399  //printf("MLE without parallelization begins\n");
400  MLEGradAscent(0.01, 100 * G->GetNodes(), "", StepAlpha, StepBeta);
401  //printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut());
402  } else {
403  //printf("MLE with parallelization begins\n");
404  MLEGradAscentParallel(0.01, 100, NumThreads, "", StepAlpha, StepBeta);
405  }
406  double HOL = LikelihoodHoldOut();
407  HOL = HOL < 0? HOL: TFlt::Mn;
408  HOLV[c] += HOL;
409  }
410  }
411  else {
412  HOVIDSV.Gen(G->GetNodes());
413  HOKIDSV.Gen(G->GetNodes());
414  if (NumThreads == 1) {
415  MLEGradAscent(0.005, 100 * G->GetNodes(), "", StepAlpha, StepBeta);
416  printf("likelihood: train:%f, attr:%f, hold:%f\n", Likelihood(), LikelihoodAttr(), LikelihoodHoldOut());
417  } else {
418  MLEGradAscentParallel(0.005, 100, NumThreads, "", StepAlpha, StepBeta);
419  }
420  //int NumParams = G->GetNodes() * Coms + Coms * Attrs;
421  double NumParams = (1.0 - WeightAttr) * Coms + WeightAttr * Coms * Attrs;
422  double Observations = (1.0 - WeightAttr) * G->GetNodes() * (G->GetNodes() - 1) / 2 + WeightAttr * G->GetNodes() * Attrs;
423  double BIC = 2 * Likelihood() - NumParams * log (Observations);
424  HOLV[c] = BIC;
425  }
426  }
427  int EstComs = 2;
428  double MaxL = TFlt::Mn;
429  if (UseBIC) {
430  printf("Number of communities vs likelihood (criterion: BIC)\n");
431  } else {
432  printf("Number of communities vs likelihood (criterion: Cross validation)\n");
433  }
434  for (int c = 0; c < ComsV.Len(); c++) {
435  ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val));
436  printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val);
437  if (MaxL < HOLV[c]) {
438  MaxL = HOLV[c];
439  EstComs = ComsV[c];
440  }
441  }
442  printf("\n");
443  RandomInit(EstComs);
444  HOVIDSV.Gen(G->GetNodes());
445  HOKIDSV.Gen(G->GetNodes());
446  if (! PlotLFNm.Empty()) {
447  TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood");
448  }
449  return EstComs;
450 }
451 
453  double L = 0.0;
454  for (int u = 0; u < HOVIDSV.Len(); u++) {
455  for (int e = 0; e < HOVIDSV[u].Len(); e++) {
456  int VID = HOVIDSV[u][e];
457  if (VID == u) { continue; }
458  double Pred = Prediction(u, VID);
459  if (G->IsEdge(u, VID)) {
460  L += log(1.0 - Pred);
461  }
462  else {
463  L += NegWgt * log(Pred);
464  }
465  //printf("NODE %d, %d: Dot Prod: %f, Prediction: %f L:%f\n", u, VID, DotProduct(F[u], F[VID]), Pred, L);
466  }
467  }
468  L *= (1.0 - WeightAttr);
469  //printf("likelihood for hold out network:%f\n", L);
470  for (int u = 0; u < HOKIDSV.Len(); u++) {
471  for (int e = 0; e < HOKIDSV[u].Len(); e++) {
472  IAssert(HOKIDSV[u][e] < Attrs);
473  L += WeightAttr * LikelihoodAttrKForRow(u, HOKIDSV[u][e]);
474  //printf("asbefsafd ATTRIBUTE %d, NODE %d:%f\n", u, HOKIDSV[u][e].Val, LikelihoodAttrKForRow(u, HOKIDSV[u][e]));
475  }
476  }
477  return L;
478 }
479 
480 double TCesna::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) {
481  double StepSize = 1.0;
482  double InitLikelihood = LikelihoodForRow(UID);
483  TIntFltH NewVarV(DeltaV.Len());
484  for(int iter = 0; iter < MaxIter; iter++) {
485  for (int i = 0; i < DeltaV.Len(); i++){
486  int CID = DeltaV.GetKey(i);
487  double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID);
488  if (NewVal < MinVal) { NewVal = MinVal; }
489  if (NewVal > MaxVal) { NewVal = MaxVal; }
490  NewVarV.AddDat(CID, NewVal);
491  }
492  if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) {
493  StepSize *= Beta;
494  } else {
495  break;
496  }
497  if (iter == MaxIter - 1) {
498  StepSize = 0.0;
499  break;
500  }
501  }
502  return StepSize;
503 }
504 
505 int TCesna::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
506  time_t InitTime = time(NULL);
507  TExeTm ExeTm, CheckTm;
508  int iter = 0, PrevIter = 0;
509  TIntFltPrV IterLV;
510  TUNGraph::TNodeI UI;
511  double PrevL = TFlt::Mn, CurL = 0.0;
512  TIntV NIdxV(F.Len(), 0);
513  for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
514  TIntFltH GradV;
515  //consider all C
516  TIntSet CIDSet(NumComs);
517  for (int c = 0; c < NumComs; c++) { CIDSet.AddKey(c); }
518  while(iter < MaxIter) {
519  NIdxV.Shuffle(Rnd);
520  for (int ui = 0; ui < F.Len(); ui++, iter++) {
521  int u = NIdxV[ui]; //
522  /*
523  //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
524  UI = G->GetNI(u);
525  TIntSet CIDSet(20 * UI.GetDeg());
526  for (int e = 0; e < UI.GetDeg(); e++) {
527  if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
528  TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
529  for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
530  CIDSet.AddKey(CI.GetKey());
531  }
532  }
533  for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
534  if (! CIDSet.IsKey(CI.GetKey())) {
535  DelCom(u, CI.GetKey());
536  }
537  }
538  if (CIDSet.Empty()) { continue; }
539  */
540 
541  GradientForRow(u, GradV, CIDSet);
542  if (Norm2(GradV) < 1e-4) { continue; }
543  double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
544  if (LearnRate == 0.0) { continue; }
545  for (int ci = 0; ci < GradV.Len(); ci++) {
546  int CID = GradV.GetKey(ci);
547  double Change = LearnRate * GradV.GetDat(CID);
548  double NewFuc = GetCom(u, CID) + Change;
549  if (NewFuc <= 0.0) {
550  DelCom(u, CID);
551  } else {
552  AddCom(u, CID, NewFuc);
553  }
554  }
555  if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) {
556  IterLV.Add(TIntFltPr(iter, Likelihood(false)));
557  }
558  }
559  // fit W (logistic regression)
560  for (int k = 0; k < Attrs; k++) {
561  TFltV GradWV(NumComs);
562  GradientForWK(GradWV, k);
563  if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; }
564  double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta);
565  if (LearnRate == 0.0) { continue; }
566  for (int c = 0; c < GradWV.Len(); c++){
567  W[k][c] += LearnRate * GradWV[c];
568  if (W[k][c] < MinValW) { W[k][c] = MinValW; }
569  if (W[k][c] > MaxValW) { W[k][c] = MaxValW; }
570  }
571  }
572  printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime);
573  fflush(stdout);
574  if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
575  PrevIter = iter;
576  CurL = Likelihood();
577  if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
578  printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL, CurL - PrevL);
579  }
580  fflush(stdout);
581  if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
582  else { PrevL = CurL; }
583  }
584 
585  }
586  printf("\n");
587  printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
588  if (! PlotNm.Empty()) {
589  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
590  }
591  return iter;
592 }
593 
594 int TCesna::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
595  //parallel
596  time_t InitTime = time(NULL);
597  uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs();
598  TExeTm ExeTm, CheckTm;
599  double PrevL = Likelihood(true);
600  TIntFltPrV IterLV;
601  int PrevIter = 0;
602  int iter = 0;
603  TIntV NIdxV(F.Len(), 0);
604  for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
605  TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization
606  NIDOPTV.PutAll(0);
607  TVec<TIntFltH> NewF(ChunkNum * ChunkSize);
608  TIntV NewNIDV(ChunkNum * ChunkSize);
609  for (iter = 0; iter < MaxIter; iter++) {
610  NIdxV.Clr(false);
611  for (int i = 0; i < F.Len(); i++) {
612  if (NIDOPTV[i] == 0) { NIdxV.Add(i); }
613  }
614  IAssert (NIdxV.Len() <= F.Len());
615  NIdxV.Shuffle(Rnd);
616  // compute gradient for chunk of nodes
617 #pragma omp parallel for schedule(static, 1)
618  for (int TIdx = 0; TIdx < ChunkNum; TIdx++) {
619  TIntFltH GradV;
620  for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) {
621  NewNIDV[ui] = -1;
622  if (ui >= NIdxV.Len()) { continue; }
623  int u = NIdxV[ui]; //
624  //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
625  TUNGraph::TNodeI UI = G->GetNI(u);
626  TIntSet CIDSet(5 * UI.GetDeg());
627  TIntFltH CurFU = F[u];
628  for (int e = 0; e < UI.GetDeg(); e++) {
629  if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
630  TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
631  for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
632  CIDSet.AddKey(CI.GetKey());
633  }
634  }
635  if (CIDSet.Empty()) {
636  CurFU.Clr();
637  }
638  else {
639  for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors
640  if (! CIDSet.IsKey(CI.GetKey())) {
641  CurFU.DelIfKey(CI.GetKey());
642  }
643  }
644  GradientForRow(u, GradV, CIDSet);
645  if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; }
646  double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
647  if (LearnRate == 0.0) { NewNIDV[ui] = -2; continue; }
648  for (int ci = 0; ci < GradV.Len(); ci++) {
649  int CID = GradV.GetKey(ci);
650  double Change = LearnRate * GradV.GetDat(CID);
651  double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change;
652  if (NewFuc <= 0.0) {
653  CurFU.DelIfKey(CID);
654  } else {
655  CurFU.AddDat(CID) = NewFuc;
656  }
657  }
658  CurFU.Defrag();
659  }
660  //store changes
661  NewF[ui] = CurFU;
662  NewNIDV[ui] = u;
663  }
664  }
665  int NumNoChangeGrad = 0;
666  int NumNoChangeStepSize = 0;
667  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
668  int NewNID = NewNIDV[ui];
669  if (NewNID == -1) { NumNoChangeGrad++; continue; }
670  if (NewNID == -2) { NumNoChangeStepSize++; continue; }
671  for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
672  SumFV[CI.GetKey()] -= CI.GetDat();
673  }
674  }
675 #pragma omp parallel for
676  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
677  int NewNID = NewNIDV[ui];
678  if (NewNID < 0) { continue; }
679  F[NewNID] = NewF[ui];
680  }
681  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
682  int NewNID = NewNIDV[ui];
683  if (NewNID < 0) { continue; }
684  for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
685  SumFV[CI.GetKey()] += CI.GetDat();
686  }
687  }
688  // update the nodes who are optimal
689  for (int ui = 0; ui < NewNIDV.Len(); ui++) {
690  int NewNID = NewNIDV[ui];
691  if (NewNID < 0) { continue; }
692  TUNGraph::TNodeI UI = G->GetNI(NewNID);
693  NIDOPTV[NewNID] = 0;
694  for (int e = 0; e < UI.GetDeg(); e++) {
695  NIDOPTV[UI.GetNbrNId(e)] = 0;
696  }
697  }
698  int OPTCnt = 0;
699  for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } }
700  // fit W (logistic regression)
701  if (! PlotNm.Empty()) {
702  printf("\r%d iterations [%s] %s secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr());
703  if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); }
704  fflush(stdout);
705  }
706  if (iter == 0 || (iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) {
707  #pragma omp parallel for
708  for (int k = 0; k < Attrs; k++) {
709  TFltV GradWV(NumComs);
710  GradientForWK(GradWV, k);
711  if (TLinAlg::Norm2(GradWV) < 1e-4) { continue; }
712  double LearnRate = GetStepSizeByLineSearchForWK(k, GradWV, GradWV, StepAlpha, StepBeta);
713  if (LearnRate == 0.0) { continue; }
714  for (int c = 0; c < GradWV.Len(); c++){
715  W[k][c] += LearnRate * GradWV[c];
716  if (W[k][c] < MinValW) { W[k][c] = MinValW; }
717  if (W[k][c] > MaxValW) { W[k][c] = MaxValW; }
718  }
719  }
720  PrevIter = iter;
721  double CurL = Likelihood(true);
722  IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL));
723  printf("\r%d iterations, Likelihood: %f, Diff: %f [%lu secs]", iter, CurL, CurL - PrevL, time(NULL) - InitTime);
724  fflush(stdout);
725  if (CurL - PrevL <= Thres * fabs(PrevL)) {
726  break;
727  }
728  else {
729  PrevL = CurL;
730  }
731  }
732  }
733  if (! PlotNm.Empty()) {
734  printf("\nMLE completed with %d iterations(%s secs)\n", iter, TUInt64::GetStr(TSecTm::GetCurTm().GetAbsSecs()-StartTm).CStr());
735  TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
736  } else {
737  printf("\rMLE completed with %d iterations(%lu secs)", iter, time(NULL) - InitTime);
738  fflush(stdout);
739  }
740  return iter;
741 }
int MLEGradAscentParallel(const double &Thres, const int &MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:594
#define IAssert(Cond)
Definition: bd.h:262
void GenHoldOutAttr(const double HOFrac, TVec< TIntSet > &HOSetV)
Definition: agmattr.h:397
PUNGraph G
Definition: agmattr.h:248
TFltV SumFV
Definition: agmattr.h:256
TFlt MaxVal
Definition: agmattr.h:262
TBool DoParallel
Definition: agmattr.h:269
Definition: tm.h:355
double Sum(const TIntFltH &UV)
Definition: agmattr.h:609
double GetCom(const int &NID, const int &CID)
Definition: agmattr.h:527
void AddKeyV(const TVec< TKey > &KeyV)
Definition: shash.h:1284
TFlt NegWgt
Definition: agmattr.h:265
TVec< TFltV > W
Definition: agmattr.h:251
int Val
Definition: dt.h:1136
int MLEGradAscent(const double &Thres, const int &MaxIter, const TStr PlotNm, const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:505
double Prediction(const TIntFltH &FU, const TIntFltH &FV)
Definition: agmattr.h:584
int GetKeyId(const TKey &Key) const
Definition: shash.h:1328
double DotProduct(const TIntFltH &UV, const TIntFltH &VV)
Definition: agmattr.h:564
void GetNIdV(TIntV &NIdV) const
Gets a vector IDs of all nodes in the graph.
Definition: graph.cpp:153
TFlt PNoCom
Definition: agmattr.h:268
TIter BegI() const
Definition: hash.h:213
TFlt MinVal
Definition: agmattr.h:261
int GetEdges() const
Returns the number of edges in the graph.
Definition: graph.cpp:82
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:68
double LikelihoodAttr()
Definition: agmattr.h:371
void GetKeyV(TVec< TKey > &KeyV) const
Definition: shash.h:1347
void Gen(const int &ExpectVals)
Definition: shash.h:1115
double GetAttr(const int &NID, const int &K)
Definition: agmattr.h:534
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:262
TIntSet NIDToIdx
Definition: agmattr.h:254
TIter EndI() const
Definition: hash.h:218
void GetW(TVec< TFltV > &_W)
Definition: agmattr.h:346
double GetStepSizeByLineSearch(const int UID, const TIntFltH &DeltaV, const TIntFltH &GradV, const double &Alpha, const double &Beta, const int MaxIter=10)
Definition: agmattr.cpp:480
double PredictAttrK(const TIntFltH &FU, const TFltV &WK)
Definition: agmattr.h:589
static const double Mx
Definition: dt.h:1388
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
double Likelihood(const bool DoParallel=false)
Definition: agmattr.cpp:137
const TKey & GetKey(const int &KeyId) const
Definition: shash.h:1141
int GetNodes() const
Returns the number of nodes in the graph.
Definition: graph.h:192
double LikelihoodAttrKForRow(const int UID, const int K)
Definition: agmattr.h:356
TVec< TIntFltH > F
Definition: agmattr.h:250
TFlt MaxValW
Definition: agmattr.h:264
TPair< TInt, TFlt > TIntFltPr
Definition: ds.h:87
int GetDeg() const
Returns degree of the current node.
Definition: graph.h:90
TFlt MinValW
Definition: agmattr.h:263
const char * GetTmStr() const
Definition: tm.h:370
TInt Attrs
Definition: agmattr.h:252
void Sort(const bool &Asc=true)
Sorts the elements of the vector.
Definition: ds.h:1318
void Gen(const int &ExpectVals)
Definition: hash.h:222
static double Norm2(const TFltV &x)
Definition: linalg.cpp:320
void RandomInit(const int InitComs)
Definition: agmattr.cpp:9
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1229
double LikelihoodHoldOut()
Definition: agmattr.cpp:452
unsigned long long uint64
Definition: bd.h:38
const TVal & GetDat(const TVal &Val) const
Returns reference to the first occurrence of element Val.
Definition: ds.h:838
PUNGraph GetSubGraph(const PUNGraph &Graph, const TIntV &NIdV, const bool &RenumberNodes)
Returns an induced subgraph of an undirected graph Graph with NIdV nodes with an optional node renumb...
Definition: subgraph.cpp:7
double Norm2(const TIntFltH &UV)
Definition: agmattr.h:616
uint GetAbsSecs() const
Definition: tm.h:150
TNodeI GetNI(const int &NId) const
Returns an iterator referring to the node of ID NId in the graph.
Definition: graph.h:241
TVec< TIntSet > X
Definition: agmattr.h:249
TFlt RegCoef
Definition: agmattr.h:255
TVec< TIntSet > HOKIDSV
Definition: agmattr.h:259
TVec< TIntSet > HOVIDSV
Definition: agmattr.h:258
int AddKey(const TKey &Key)
Definition: shash.h:1254
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
void AddCom(const int &NID, const int &CID, const double &Val)
Definition: agmattr.h:541
void GetCmtyVVUnSorted(TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:329
void InitW()
Definition: agmattr.h:331
static void GenHoldOutPairs(const PGraph &G, TVec< TIntSet > &HoldOutSet, double HOFrac, TRnd &Rnd)
Definition: agmattr.h:38
TRnd Rnd
Definition: agmattr.h:253
void GetCmtyVV(TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:289
void SetCmtyVV(const TVec< TIntV > &CmtyVV)
Definition: agmattr.cpp:85
void NeighborComInit(const int InitComs)
Definition: agmattr.cpp:32
int Len() const
Definition: shash.h:1121
TStr GetStr() const
Definition: dt.h:1360
double GetStepSizeByLineSearchForWK(const int K, const TFltV &DeltaV, const TFltV &GradV, const double &Alpha, const double &Beta, const int MaxIter=10)
Definition: agmattr.h:486
Definition: dt.h:412
bool Empty() const
Definition: dt.h:488
double LikelihoodForRow(const int UID)
Definition: agmattr.cpp:157
TInt NumComs
Definition: agmattr.h:257
void GradientForRow(const int UID, TIntFltH &GradU, const TIntSet &CIDSet)
Definition: agmattr.cpp:213
double GetUniDev()
Definition: dt.h:30
static TSecTm GetCurTm()
Definition: tm.cpp:697
void SetGraph(const PUNGraph &GraphPt, const THash< TInt, TIntV > &NIDAttrH)
Definition: agmattr.cpp:99
int GetNbrNId(const int &NodeN) const
Returns ID of NodeN-th neighboring node.
Definition: graph.h:111
bool IsNode(const int &NId) const
Tests whether ID NId is a node.
Definition: graph.h:235
void GradientForWK(TFltV &GradV, const int K)
Definition: agmattr.h:421
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:523
static double Log(const double &Val)
Definition: xmath.h:14
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
int FindComs(TIntV &ComsV, const bool UseBIC=false, const double HOFrac=0.2, const int NumThreads=20, const TStr PlotLFNm=TStr(), const double StepAlpha=0.3, const double StepBeta=0.1)
Definition: agmattr.cpp:361
char * CStr()
Definition: dt.h:476
bool IsEdge(const int &SrcNId, const int &DstNId) const
Tests whether an edge between node IDs SrcNId and DstNId exists in the graph.
Definition: graph.cpp:137
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelSelfEdges(const PGraph &Graph)
Removes all the self-edges from the graph.
Definition: alg.h:419
int Len() const
Definition: hash.h:228
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
void DelCom(const int &NID, const int &CID)
Definition: agmattr.h:549
TFlt WeightAttr
Definition: agmattr.h:267
THKeyDat * EndI
Definition: hash.h:54
static void PlotValV(const TVec< TVal1 > &ValV, const TStr &OutFNmPref, const TStr &Desc="", const TStr &XLabel="", const TStr &YLabel="", const TGpScaleTy &ScaleTy=gpsAuto, const bool &PowerFit=false, const TGpSeriesTy &SeriesTy=gpwLinesPoints)
Definition: gnuplot.h:398
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:252
static const double Mn
Definition: dt.h:1387
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:430
void SortByDat(const bool &Asc=true)
Definition: hash.h:292