SNAP Library 2.2, Developer Reference  2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
agmfit.cpp
Go to the documentation of this file.
00001 #include "stdafx.h"
00002 #include "agmfit.h"
00003 #include "agm.h"
00004 
00006 // AGM fitting
00007 
00008 void TAGMFit::Save(TSOut& SOut) {
00009   G->Save(SOut);
00010   CIDNSetV.Save(SOut);
00011   EdgeComVH.Save(SOut);
00012   NIDComVH.Save(SOut);
00013   ComEdgesV.Save(SOut);
00014   PNoCom.Save(SOut);
00015   LambdaV.Save(SOut);
00016   NIDCIDPrH.Save(SOut);
00017   NIDCIDPrS.Save(SOut);
00018   MinLambda.Save(SOut);
00019   MaxLambda.Save(SOut);
00020   RegCoef.Save(SOut);
00021   BaseCID.Save(SOut);
00022 }
00023 
00024 void TAGMFit::Load(TSIn& SIn, const int& RndSeed) {
00025   G = TUNGraph::Load(SIn);
00026   CIDNSetV.Load(SIn);
00027   EdgeComVH.Load(SIn);
00028   NIDComVH.Load(SIn);
00029   ComEdgesV.Load(SIn);
00030   PNoCom.Load(SIn);
00031   LambdaV.Load(SIn);
00032   NIDCIDPrH.Load(SIn);
00033   NIDCIDPrS.Load(SIn);
00034   MinLambda.Load(SIn);
00035   MaxLambda.Load(SIn);
00036   RegCoef.Load(SIn);
00037   BaseCID.Load(SIn);
00038   Rnd.PutSeed(RndSeed);
00039 }
00040 
00041 // Randomly initialize bipartite community affiliation graph.
00042 void TAGMFit::RandomInitCmtyVV(const int InitComs, const double ComSzAlpha, const double MemAlpha, const int MinComSz, const int MaxComSz, const int MinMem, const int MaxMem) {
00043   TVec<TIntV> InitCmtyVV(InitComs, 0);
00044   TAGMUtil::GenCmtyVVFromPL(InitCmtyVV, G, G->GetNodes(), InitComs, ComSzAlpha, MemAlpha, MinComSz, MaxComSz,
00045       MinMem, MaxMem, Rnd);
00046   SetCmtyVV(InitCmtyVV);
00047 }
00048 
00049 // For each (u, v) in edges, precompute C_uv (the set of communities u and v share).
00050 void TAGMFit::GetEdgeJointCom() {
00051   ComEdgesV.Gen(CIDNSetV.Len());
00052   EdgeComVH.Gen(G->GetEdges());
00053   for (TUNGraph::TNodeI SrcNI = G->BegNI(); SrcNI < G->EndNI(); SrcNI++) {
00054     int SrcNID = SrcNI.GetId();
00055     for (int v = 0; v < SrcNI.GetDeg(); v++) {
00056       int DstNID = SrcNI.GetNbrNId(v);
00057       if (SrcNID >= DstNID) { continue; }
00058       TIntSet JointCom;
00059       IAssert(NIDComVH.IsKey(SrcNID));
00060       IAssert(NIDComVH.IsKey(DstNID));
00061       TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID), NIDComVH.GetDat(DstNID), JointCom);
00062       EdgeComVH.AddDat(TIntPr(SrcNID,DstNID),JointCom);
00063       for (int k = 0; k < JointCom.Len(); k++) {
00064         ComEdgesV[JointCom[k]]++;
00065       }
00066     }
00067   }
00068   IAssert(EdgeComVH.Len() == G->GetEdges());
00069 }
00070 
00071 // Set epsilon by the default value.
00072 void TAGMFit::SetDefaultPNoCom() {
00073   PNoCom = 1.0 / (double) G->GetNodes() / (double) G->GetNodes();
00074 }
00075 
00076 double TAGMFit::Likelihood(const TFltV& NewLambdaV, double& LEdges, double& LNoEdges) {
00077   IAssert(CIDNSetV.Len() == NewLambdaV.Len());
00078   IAssert(ComEdgesV.Len() == CIDNSetV.Len());
00079   LEdges = 0.0; LNoEdges = 0.0;
00080   for (int e = 0; e < EdgeComVH.Len(); e++) {
00081     TIntSet& JointCom = EdgeComVH[e];
00082     double LambdaSum = SelectLambdaSum(NewLambdaV, JointCom);
00083     double Puv = 1 - exp(- LambdaSum);
00084     if (JointCom.Len() == 0) {  Puv = PNoCom;  }
00085     IAssert(! _isnan(log(Puv)));
00086     LEdges += log(Puv);
00087   }
00088   for (int k = 0; k < NewLambdaV.Len(); k++) {
00089     int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
00090     int NotEdgesInCom = MaxEk - ComEdgesV[k];
00091     if(NotEdgesInCom > 0) {
00092       if (LNoEdges >= TFlt::Mn + double(NotEdgesInCom) * NewLambdaV[k]) { 
00093         LNoEdges -= double(NotEdgesInCom) * NewLambdaV[k];
00094       }
00095     }
00096   }
00097   double LReg = 0.0;
00098   if (RegCoef > 0.0) {
00099     LReg = - RegCoef * TLinAlg::SumVec(NewLambdaV);
00100   }
00101   return LEdges + LNoEdges + LReg;
00102 }
00103 
00104 double TAGMFit::Likelihood() { 
00105   return Likelihood(LambdaV); 
00106 }
00107 
00108 // Step size search for updating P_c (which is parametarized by lambda).
00109 double TAGMFit::GetStepSizeByLineSearchForLambda(const TFltV& DeltaV, const TFltV& GradV, const double& Alpha, const double& Beta) {
00110   double StepSize = 1.0;
00111   double InitLikelihood = Likelihood();
00112   IAssert(LambdaV.Len() == DeltaV.Len());
00113   TFltV NewLambdaV(LambdaV.Len());
00114   for (int iter = 0; ; iter++) {
00115     for (int i = 0; i < LambdaV.Len(); i++) {
00116       NewLambdaV[i] = LambdaV[i] + StepSize * DeltaV[i];
00117       if (NewLambdaV[i] < MinLambda) { NewLambdaV[i] = MinLambda; }
00118       if (NewLambdaV[i] > MaxLambda) { NewLambdaV[i] = MaxLambda; }
00119     }
00120     if (Likelihood(NewLambdaV) < InitLikelihood + Alpha * StepSize * TLinAlg::DotProduct(GradV, DeltaV)) {
00121       StepSize *= Beta;
00122     } else {
00123       break;
00124     }
00125   }
00126   return StepSize;
00127 }
00128 
00129 // Gradient descent for p_c while fixing community affiliation graph (CAG).
00130 int TAGMFit::MLEGradAscentGivenCAG(const double& Thres, const int& MaxIter, const TStr PlotNm) {
00131   int Edges = G->GetEdges();
00132   TExeTm ExeTm;
00133   TFltV GradV(LambdaV.Len());
00134   int iter = 0;
00135   TIntFltPrV IterLV, IterGradNormV;
00136   double GradCutOff = 1000;
00137   for (iter = 0; iter < MaxIter; iter++) {
00138     GradLogLForLambda(GradV);    //if gradient is going out of the boundary, cut off
00139     for (int i = 0; i < LambdaV.Len(); i++) {
00140       if (GradV[i] < -GradCutOff) { GradV[i] = -GradCutOff; }
00141       if (GradV[i] > GradCutOff) { GradV[i] = GradCutOff; }
00142       if (LambdaV[i] <= MinLambda && GradV[i] < 0) { GradV[i] = 0.0; }
00143       if (LambdaV[i] >= MaxLambda && GradV[i] > 0) { GradV[i] = 0.0; }
00144     }
00145     double Alpha = 0.15, Beta = 0.2;
00146     if (Edges > Kilo(100)) { Alpha = 0.00015; Beta = 0.3;}
00147     double LearnRate = GetStepSizeByLineSearchForLambda(GradV, GradV, Alpha, Beta);
00148     if (TLinAlg::Norm(GradV) < Thres) { break; }
00149     for (int i = 0; i < LambdaV.Len(); i++) {
00150       double Change = LearnRate * GradV[i];
00151       LambdaV[i] += Change;
00152       if(LambdaV[i] < MinLambda) { LambdaV[i] = MinLambda;}
00153       if(LambdaV[i] > MaxLambda) { LambdaV[i] = MaxLambda;}
00154     }
00155     if (! PlotNm.Empty()) {
00156       double L = Likelihood();
00157       IterLV.Add(TIntFltPr(iter, L));
00158       IterGradNormV.Add(TIntFltPr(iter, TLinAlg::Norm(GradV)));
00159     }
00160   }
00161   if (! PlotNm.Empty()) {
00162     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00163     TGnuPlot::PlotValV(IterGradNormV, PlotNm + ".gradnorm_Q");
00164     printf("MLE for Lambda completed with %d iterations(%s)\n",iter,ExeTm.GetTmStr());
00165   }
00166   return iter;
00167 }
00168 
00169 void TAGMFit::RandomInit(const int& MaxK) {
00170   CIDNSetV.Clr();
00171   for (int c = 0; c < MaxK; c++) {
00172     CIDNSetV.Add();
00173     int NC = Rnd.GetUniDevInt(G -> GetNodes());
00174     TUNGraph::TNodeI NI = G -> GetRndNI();
00175     CIDNSetV.Last().AddKey(NI.GetId());
00176     for (int v = 0; v < NC; v++) {
00177       NI = G->GetNI(NI.GetNbrNId(Rnd.GetUniDevInt(NI.GetDeg())));
00178       CIDNSetV.Last().AddKey(NI.GetId());
00179     }
00180   }
00181   InitNodeData();
00182   SetDefaultPNoCom();
00183 }
00184 
00185 // Initialize node community memberships using best neighborhood communities (see D. Gleich et al. KDD'12).
00186 void TAGMFit::NeighborComInit(const int InitComs) {
00187   CIDNSetV.Gen(InitComs);
00188   const int Edges = G->GetEdges();
00189   TFltIntPrV NIdPhiV(G->GetNodes(), 0);
00190   TIntSet InvalidNIDS(G->GetNodes());
00191   TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
00192   TExeTm RunTm;
00193   //compute conductance of neighborhood community
00194   TIntV NIdV;
00195   G->GetNIdV(NIdV);
00196   for (int u = 0; u < NIdV.Len(); u++) {
00197     TIntSet NBCmty(G->GetNI(NIdV[u]).GetDeg() + 1);
00198     double Phi;
00199     if (G->GetNI(NIdV[u]).GetDeg() < 5) { //do not include nodes with too few degree
00200       Phi = 1.0; 
00201     } else {
00202       TAGMUtil::GetNbhCom(G, NIdV[u], NBCmty);
00203       IAssert(NBCmty.Len() == G->GetNI(NIdV[u]).GetDeg() + 1);
00204       Phi = TAGMUtil::GetConductance(G, NBCmty, Edges);
00205     }
00206     NIdPhiV.Add(TFltIntPr(Phi, NIdV[u]));
00207   }
00208   NIdPhiV.Sort(true);
00209   printf("conductance computation completed [%s]\n", RunTm.GetTmStr());
00210   fflush(stdout);
00211   //choose nodes with local minimum in conductance
00212   int CurCID = 0;
00213   for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
00214     int UID = NIdPhiV[ui].Val2;
00215     fflush(stdout);
00216     if (InvalidNIDS.IsKey(UID)) { continue; }
00217     ChosenNIDV.Add(UID); //FOR DEBUG
00218     //add the node and its neighbors to the current community
00219     CIDNSetV[CurCID].AddKey(UID);
00220     TUNGraph::TNodeI NI = G->GetNI(UID);
00221     fflush(stdout);
00222     for (int e = 0; e < NI.GetDeg(); e++) {
00223       CIDNSetV[CurCID].AddKey(NI.GetNbrNId(e));
00224     }
00225     //exclude its neighbors from the next considerations
00226     for (int e = 0; e < NI.GetDeg(); e++) {
00227       InvalidNIDS.AddKey(NI.GetNbrNId(e));
00228     }
00229     CurCID++;
00230     fflush(stdout);
00231     if (CurCID >= InitComs) { break;  }
00232   }
00233   if (InitComs > CurCID) {
00234     printf("%d communities needed to fill randomly\n", InitComs - CurCID);
00235   }
00236   //assign a member to zero-member community (if any)
00237   for (int c = 0; c < CIDNSetV.Len(); c++) {
00238     if (CIDNSetV[c].Len() == 0) {
00239       int ComSz = 10;
00240       for (int u = 0; u < ComSz; u++) {
00241         int UID = G->GetRndNI().GetId();
00242         CIDNSetV[c].AddKey(UID);
00243       }
00244     }
00245   }
00246   InitNodeData();
00247   SetDefaultPNoCom();
00248 }
00249 
00250 // Add epsilon community (base community which includes all nodes) into community affiliation graph. It means that we fit for epsilon.
00251 void TAGMFit::AddBaseCmty() {
00252   TVec<TIntV> CmtyVV;
00253   GetCmtyVV(CmtyVV);
00254   TIntV TmpV = CmtyVV[0];
00255   CmtyVV.Add(TmpV);
00256   G->GetNIdV(CmtyVV[0]);
00257   IAssert(CIDNSetV.Len() + 1 == CmtyVV.Len());
00258   SetCmtyVV(CmtyVV);
00259   InitNodeData();
00260   BaseCID = 0;
00261   PNoCom = 0.0;
00262 }
00263 
00264 void TAGMFit::InitNodeData() {
00265   TSnap::DelSelfEdges(G);
00266   NIDComVH.Gen(G->GetNodes());
00267   for (TUNGraph::TNodeI NI = G->BegNI(); NI < G->EndNI(); NI++) {
00268     NIDComVH.AddDat(NI.GetId());
00269   }
00270   TAGMUtil::GetNodeMembership(NIDComVH, CIDNSetV);
00271   GetEdgeJointCom();
00272   LambdaV.Gen(CIDNSetV.Len());
00273   for (int c = 0; c < CIDNSetV.Len(); c++) {
00274     int MaxE = (CIDNSetV[c].Len()) * (CIDNSetV[c].Len() - 1) / 2;
00275     if (MaxE < 2) {
00276       LambdaV[c] = MaxLambda;
00277     }
00278     else{
00279       LambdaV[c] = -log((double) (MaxE - ComEdgesV[c]) / MaxE);
00280     }
00281     if (LambdaV[c] > MaxLambda) {  LambdaV[c] = MaxLambda;  }
00282     if (LambdaV[c] < MinLambda) {  LambdaV[c] = MinLambda;  }
00283   }
00284   NIDCIDPrS.Gen(G->GetNodes() * 10);
00285   for (int c = 0; c < CIDNSetV.Len(); c++) {
00286     for (TIntSet::TIter SI = CIDNSetV[c].BegI(); SI < CIDNSetV[c].EndI(); SI++) {
00287       NIDCIDPrS.AddKey(TIntPr(SI.GetKey(), c));
00288     }
00289   }
00290 }
00291 
00292 // After MCMC, NID leaves community CID.
00293 void TAGMFit::LeaveCom(const int& NID, const int& CID) {
00294   TUNGraph::TNodeI NI = G->GetNI(NID);
00295   for (int e = 0; e < NI.GetDeg(); e++) {
00296     int VID = NI.GetNbrNId(e);
00297     if (NIDComVH.GetDat(VID).IsKey(CID)) {
00298       TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
00299       EdgeComVH.GetDat(SrcDstNIDPr).DelKey(CID);
00300       ComEdgesV[CID]--;
00301     }
00302   }
00303   CIDNSetV[CID].DelKey(NID);
00304   NIDComVH.GetDat(NID).DelKey(CID);
00305   NIDCIDPrS.DelKey(TIntPr(NID, CID));
00306 }
00307 
00308 // After MCMC, NID joins community CID.
00309 void TAGMFit::JoinCom(const int& NID, const int& JoinCID) {
00310   TUNGraph::TNodeI NI = G->GetNI(NID);
00311   for (int e = 0; e < NI.GetDeg(); e++) {
00312     int VID = NI.GetNbrNId(e);
00313     if (NIDComVH.GetDat(VID).IsKey(JoinCID)) {
00314       TIntPr SrcDstNIDPr = TIntPr(TMath::Mn(NID,VID), TMath::Mx(NID,VID));
00315       EdgeComVH.GetDat(SrcDstNIDPr).AddKey(JoinCID);
00316       ComEdgesV[JoinCID]++;
00317     }
00318   }
00319   CIDNSetV[JoinCID].AddKey(NID);
00320   NIDComVH.GetDat(NID).AddKey(JoinCID);
00321   NIDCIDPrS.AddKey(TIntPr(NID, JoinCID));
00322 }
00323 
00324 // Sample transition: Choose among (join, leave, switch), and then sample (NID, CID).
00325 void TAGMFit::SampleTransition(int& NID, int& JoinCID, int& LeaveCID, double& DeltaL) {
00326   int Option = Rnd.GetUniDevInt(3); //0:Join 1:Leave 2:Switch
00327   if (NIDCIDPrS.Len() <= 1) {    Option = 0;  } //if there is only one node membership, only join is possible.
00328   int TryCnt = 0;
00329   const int MaxTryCnt = G->GetNodes();
00330   DeltaL = TFlt::Mn;
00331   if (Option == 0) {
00332     do {
00333       JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
00334       NID = G->GetRndNId();
00335     } while (TryCnt++ < MaxTryCnt && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
00336     if (TryCnt < MaxTryCnt) { //if successfully find a move
00337       DeltaL = SeekJoin(NID, JoinCID);
00338     }
00339   }
00340   else if (Option == 1) {
00341     do {
00342       TIntPr NIDCIDPr = NIDCIDPrS.GetKey(NIDCIDPrS.GetRndKeyId(Rnd));
00343       NID = NIDCIDPr.Val1;
00344       LeaveCID = NIDCIDPr.Val2;
00345     } while (TryCnt++ < MaxTryCnt && LeaveCID == BaseCID);
00346     if (TryCnt < MaxTryCnt) {//if successfully find a move
00347       DeltaL = SeekLeave(NID, LeaveCID);
00348     }
00349   }
00350   else{
00351     do {
00352       TIntPr NIDCIDPr = NIDCIDPrS.GetKey(NIDCIDPrS.GetRndKeyId(Rnd));
00353       NID = NIDCIDPr.Val1;
00354       LeaveCID = NIDCIDPr.Val2;
00355     } while (TryCnt++ < MaxTryCnt && (NIDComVH.GetDat(NID).Len() == CIDNSetV.Len() || LeaveCID == BaseCID));
00356     do {
00357       JoinCID = Rnd.GetUniDevInt(CIDNSetV.Len());
00358     } while (TryCnt++ < G->GetNodes() && NIDCIDPrS.IsKey(TIntPr(NID, JoinCID)));
00359     if (TryCnt < MaxTryCnt) {//if successfully find a move
00360       DeltaL = SeekSwitch(NID, LeaveCID, JoinCID);
00361     }
00362   }
00363 }
00364 
00366 void TAGMFit::RunMCMC(const int& MaxIter, const int& EvalLambdaIter, const TStr& PlotFPrx) {
00367   TExeTm IterTm, TotalTm;
00368   double PrevL = Likelihood(), DeltaL = 0;
00369   double BestL = PrevL;
00370   printf("initial likelihood = %f\n",PrevL);
00371   TIntFltPrV IterTrueLV, IterJoinV, IterLeaveV, IterAcceptV, IterSwitchV, IterLBV;
00372   TIntPrV IterTotMemV;
00373   TIntV IterV;
00374   TFltV BestLV;
00375   TVec<TIntSet> BestCmtySetV;
00376   int SwitchCnt = 0, LeaveCnt = 0, JoinCnt = 0, AcceptCnt = 0, ProbBinSz;
00377   int Nodes = G->GetNodes(), Edges = G->GetEdges();
00378   TExeTm PlotTm;
00379   ProbBinSz = TMath::Mx(1000, G->GetNodes() / 10); //bin to compute probabilities
00380   IterLBV.Add(TIntFltPr(1, BestL));
00381 
00382   for (int iter = 0; iter < MaxIter; iter++) {
00383     IterTm.Tick();
00384     int NID = -1;
00385     int JoinCID = -1, LeaveCID = -1;
00386     SampleTransition(NID, JoinCID, LeaveCID, DeltaL); //sample a move
00387     double OptL = PrevL;
00388     if (DeltaL > 0 || Rnd.GetUniDev() < exp(DeltaL)) { //if it is accepted
00389       IterTm.Tick();
00390       if (LeaveCID > -1 && LeaveCID != BaseCID) { LeaveCom(NID, LeaveCID); }
00391       if (JoinCID > -1 && JoinCID != BaseCID) { JoinCom(NID, JoinCID); }
00392       if (LeaveCID > -1 && JoinCID > -1 && JoinCID != BaseCID && LeaveCID != BaseCID) { SwitchCnt++; }
00393       else if (LeaveCID > -1  && LeaveCID != BaseCID) { LeaveCnt++;}
00394       else if (JoinCID > -1 && JoinCID != BaseCID) { JoinCnt++;}
00395       AcceptCnt++;
00396       if ((iter + 1) % EvalLambdaIter == 0) {
00397         IterTm.Tick();
00398         MLEGradAscentGivenCAG(0.01, 3);
00399         OptL = Likelihood();
00400       }
00401       else{
00402         OptL = PrevL + DeltaL;
00403       }
00404       if (BestL <= OptL && CIDNSetV.Len() > 0) {
00405         BestCmtySetV = CIDNSetV;
00406         BestLV = LambdaV;
00407         BestL = OptL;
00408       }
00409     }
00410     if (iter > 0 && (iter % ProbBinSz == 0) && PlotFPrx.Len() > 0) { 
00411       IterLBV.Add(TIntFltPr(iter, OptL));
00412       IterSwitchV.Add(TIntFltPr(iter, (double) SwitchCnt / (double) AcceptCnt));
00413       IterLeaveV.Add(TIntFltPr(iter, (double) LeaveCnt / (double) AcceptCnt));
00414       IterJoinV.Add(TIntFltPr(iter, (double) JoinCnt / (double) AcceptCnt));
00415       IterAcceptV.Add(TIntFltPr(iter, (double) AcceptCnt / (double) ProbBinSz));
00416       SwitchCnt = JoinCnt = LeaveCnt = AcceptCnt = 0;
00417     }
00418     PrevL = OptL;
00419     if ((iter + 1) % 10000 == 0) {
00420       printf("\r%d iterations completed [%.2f]", iter, (double) iter / (double) MaxIter);
00421     }
00422   }
00423   
00424   // plot the likelihood and acceptance probabilities if the plot file name is given
00425   if (PlotFPrx.Len() > 0) {
00426     TGnuPlot GP1;
00427     GP1.AddPlot(IterLBV, gpwLinesPoints, "likelihood");
00428     GP1.SetDataPlotFNm(PlotFPrx + ".likelihood.tab", PlotFPrx + ".likelihood.plt");
00429     TStr TitleStr = TStr::Fmt(" N:%d E:%d", Nodes, Edges);
00430     GP1.SetTitle(PlotFPrx + ".likelihood" + TitleStr);
00431     GP1.SavePng(PlotFPrx + ".likelihood.png");
00432 
00433     TGnuPlot GP2;
00434     GP2.AddPlot(IterSwitchV, gpwLinesPoints, "Switch");
00435     GP2.AddPlot(IterLeaveV, gpwLinesPoints, "Leave");
00436     GP2.AddPlot(IterJoinV, gpwLinesPoints, "Join");
00437     GP2.AddPlot(IterAcceptV, gpwLinesPoints, "Accept");
00438     GP2.SetTitle(PlotFPrx + ".transition");
00439     GP2.SetDataPlotFNm(PlotFPrx + "transition_prob.tab", PlotFPrx + "transition_prob.plt");
00440     GP2.SavePng(PlotFPrx + "transition_prob.png");
00441   }
00442   CIDNSetV = BestCmtySetV;
00443   LambdaV = BestLV;
00444 
00445   InitNodeData();
00446   MLEGradAscentGivenCAG(0.001, 100);
00447   printf("\nMCMC completed (best likelihood: %.2f) [%s]\n", BestL, TotalTm.GetTmStr());
00448 }
00449 
00450 // Returns \v QV, a vector of (1 - p_c) for each community c.
00451 void TAGMFit::GetQV(TFltV& OutV) {
00452   OutV.Gen(LambdaV.Len());
00453   for (int i = 0; i < LambdaV.Len(); i++) {
00454     OutV[i] = exp(- LambdaV[i]);
00455   }
00456 }
00457 
00458 // Remove empty communities.
00459 int TAGMFit::RemoveEmptyCom() {
00460   int DelCnt = 0;
00461   for (int c = 0; c < CIDNSetV.Len(); c++) {
00462     if (CIDNSetV[c].Len() == 0) {
00463       CIDNSetV[c] = CIDNSetV.Last();
00464       CIDNSetV.DelLast();
00465       LambdaV[c] = LambdaV.Last();
00466       LambdaV.DelLast();
00467       DelCnt++;
00468       c--;
00469     }
00470   }
00471   return DelCnt;
00472 }
00473 
00474 // Compute the change in likelihood (Delta) if node UID leaves community CID.
00475 double TAGMFit::SeekLeave(const int& UID, const int& CID) {
00476   IAssert(CIDNSetV[CID].IsKey(UID));
00477   IAssert(G->IsNode(UID));
00478   double Delta = 0.0;
00479   TUNGraph::TNodeI NI = G->GetNI(UID);
00480   int NbhsInC = 0;
00481   for (int e = 0; e < NI.GetDeg(); e++) {
00482     const int VID = NI.GetNbrNId(e);
00483     if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
00484     TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
00485     TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
00486     double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
00487     CurPuv = 1 - exp(- LambdaSum);
00488     NewPuv = 1 - exp(- LambdaSum + LambdaV[CID]);
00489     IAssert(JointCom.Len() > 0);
00490     if (JointCom.Len() == 1) {
00491       NewPuv = PNoCom;
00492     }
00493     Delta += (log(NewPuv) - log(CurPuv));
00494     IAssert(!_isnan(Delta));
00495     NbhsInC++;
00496   }
00497   Delta += LambdaV[CID] * (CIDNSetV[CID].Len() - 1 - NbhsInC);
00498   return Delta;
00499 }
00500 
00501 // Compute the change in likelihood (Delta) if node UID joins community CID.
00502 double TAGMFit::SeekJoin(const int& UID, const int& CID) {
00503   IAssert(! CIDNSetV[CID].IsKey(UID));
00504   double Delta = 0.0;
00505   TUNGraph::TNodeI NI = G->GetNI(UID);
00506   int NbhsInC = 0;
00507   for (int e = 0; e < NI.GetDeg(); e++) {
00508     const int VID = NI.GetNbrNId(e);
00509     if (! NIDComVH.GetDat(VID).IsKey(CID)) { continue; }
00510     TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
00511     TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
00512     double CurPuv, NewPuv, LambdaSum = SelectLambdaSum(JointCom);
00513     CurPuv = 1 - exp(- LambdaSum);
00514     if (JointCom.Len() == 0) { CurPuv = PNoCom; }
00515     NewPuv = 1 - exp(- LambdaSum - LambdaV[CID]);
00516     Delta += (log(NewPuv) - log(CurPuv));
00517     IAssert(!_isnan(Delta));
00518     NbhsInC++;
00519   }
00520   Delta -= LambdaV[CID] * (CIDNSetV[CID].Len() - NbhsInC);
00521   return Delta;
00522 }
00523 
00524 // Compute the change in likelihood (Delta) if node UID switches from CurCID to NewCID.
00525 double TAGMFit::SeekSwitch(const int& UID, const int& CurCID, const int& NewCID) {
00526   IAssert(! CIDNSetV[NewCID].IsKey(UID));
00527   IAssert(CIDNSetV[CurCID].IsKey(UID));
00528   double Delta = SeekJoin(UID, NewCID) + SeekLeave(UID, CurCID);
00529   //correct only for intersection between new com and current com
00530   TUNGraph::TNodeI NI = G->GetNI(UID);
00531   for (int e = 0; e < NI.GetDeg(); e++) {
00532     const int VID = NI.GetNbrNId(e);
00533     if (! NIDComVH.GetDat(VID).IsKey(CurCID) || ! NIDComVH.GetDat(VID).IsKey(NewCID)) {continue;}
00534     TIntPr SrcDstNIDPr(TMath::Mn(UID,VID), TMath::Mx(UID,VID));
00535     TIntSet& JointCom = EdgeComVH.GetDat(SrcDstNIDPr);
00536     double CurPuv, NewPuvAfterJoin, NewPuvAfterLeave, NewPuvAfterSwitch, LambdaSum = SelectLambdaSum(JointCom);
00537     CurPuv = 1 - exp(- LambdaSum);
00538     NewPuvAfterLeave = 1 - exp(- LambdaSum + LambdaV[CurCID]);
00539     NewPuvAfterJoin = 1 - exp(- LambdaSum - LambdaV[NewCID]);
00540     NewPuvAfterSwitch = 1 - exp(- LambdaSum - LambdaV[NewCID] + LambdaV[CurCID]);
00541     if (JointCom.Len() == 1 || NewPuvAfterLeave == 0.0) {
00542       NewPuvAfterLeave = PNoCom;
00543     }
00544     Delta += (log(NewPuvAfterSwitch) + log(CurPuv) - log(NewPuvAfterLeave) - log(NewPuvAfterJoin));
00545     if (_isnan(Delta)) {
00546       printf("NS:%f C:%f NL:%f NJ:%f PNoCom:%f", NewPuvAfterSwitch, CurPuv, NewPuvAfterLeave, NewPuvAfterJoin, PNoCom.Val);
00547     }
00548     IAssert(!_isnan(Delta));
00549   }
00550   return Delta;
00551 }
00552 
00553 // Get communities whose p_c is higher than 1 - QMax.
00554 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, const double QMax) {
00555   TFltV TmpQV;
00556   GetCmtyVV(CmtyVV, TmpQV, QMax);
00557 }
00558 
00559 void TAGMFit::GetCmtyVV(TVec<TIntV>& CmtyVV, TFltV& QV, const double QMax) {
00560   CmtyVV.Gen(CIDNSetV.Len(), 0);
00561   QV.Gen(CIDNSetV.Len(), 0);
00562   TIntFltH CIDLambdaH(CIDNSetV.Len());
00563   for (int c = 0; c < CIDNSetV.Len(); c++) {
00564     CIDLambdaH.AddDat(c, LambdaV[c]);
00565   }
00566   CIDLambdaH.SortByDat(false);
00567   for (int c = 0; c < CIDNSetV.Len(); c++) {
00568     int CID = CIDLambdaH.GetKey(c);
00569     IAssert(LambdaV[CID] >= MinLambda);
00570     double Q = exp( - (double) LambdaV[CID]);
00571     if (Q > QMax) { continue; }
00572     TIntV CmtyV;
00573     CIDNSetV[CID].GetKeyV(CmtyV);
00574     if (CmtyV.Len() == 0) { continue; }
00575     if (CID == BaseCID) { //if the community is the base community(epsilon community), discard
00576       IAssert(CmtyV.Len() == G->GetNodes());
00577     } else {
00578       CmtyVV.Add(CmtyV);
00579       QV.Add(Q);
00580     }
00581   }
00582 }
00583 
00584 void TAGMFit::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
00585   CIDNSetV.Gen(CmtyVV.Len());
00586   for (int c = 0; c < CIDNSetV.Len(); c++) {
00587     CIDNSetV[c].AddKeyV(CmtyVV[c]);
00588     for (int j = 0; j < CmtyVV[c].Len(); j++) { IAssert(G->IsNode(CmtyVV[c][j])); } // check whether the member nodes exist in the graph
00589   }
00590   InitNodeData();
00591   SetDefaultPNoCom();
00592 }
00593 
00594 // Gradient of likelihood for P_c.
00595 void TAGMFit::GradLogLForLambda(TFltV& GradV) {
00596   GradV.Gen(LambdaV.Len());
00597   TFltV SumEdgeProbsV(LambdaV.Len());
00598   for (int e = 0; e < EdgeComVH.Len(); e++) {
00599     TIntSet& JointCom = EdgeComVH[e];
00600     double LambdaSum = SelectLambdaSum(JointCom);
00601     double Puv = 1 - exp(- LambdaSum);
00602     if (JointCom.Len() == 0) {  Puv = PNoCom;  }
00603     for (TIntSet::TIter SI = JointCom.BegI(); SI < JointCom.EndI(); SI++) {
00604       SumEdgeProbsV[SI.GetKey()] += (1 - Puv) / Puv;
00605     }
00606   }
00607   for (int k = 0; k < LambdaV.Len(); k++) {
00608     int MaxEk = CIDNSetV[k].Len() * (CIDNSetV[k].Len() - 1) / 2;
00609     int NotEdgesInCom = MaxEk - ComEdgesV[k];
00610     GradV[k] = SumEdgeProbsV[k] - (double) NotEdgesInCom;
00611     if (LambdaV[k] > 0.0 && RegCoef > 0.0) { //if regularization exists
00612       GradV[k] -= RegCoef;
00613     }
00614   }
00615 }
00616 
00617 // Compute sum of lambda_c (which is log (1 - p_c)) over C_uv (ComK). It is used to compute edge probability P_uv.
00618 double TAGMFit::SelectLambdaSum(const TIntSet& ComK) { 
00619   return SelectLambdaSum(LambdaV, ComK); 
00620 }
00621 
00622 double TAGMFit::SelectLambdaSum(const TFltV& NewLambdaV, const TIntSet& ComK) {
00623   double Result = 0.0;
00624   for (TIntSet::TIter SI = ComK.BegI(); SI < ComK.EndI(); SI++) {
00625     IAssert(NewLambdaV[SI.GetKey()] >= 0);
00626     Result += NewLambdaV[SI.GetKey()];
00627   }
00628   return Result;
00629 }
00630 
00631 // Compute the empirical edge probability between a pair of nodes who share no community (epsilon), based on current community affiliations.
00632 double TAGMFit::CalcPNoComByCmtyVV(const int& SamplePairs) {
00633   TIntV NIdV;
00634   G->GetNIdV(NIdV);
00635   uint64 PairNoCom = 0, EdgesNoCom = 0;
00636   for (int u = 0; u < NIdV.Len(); u++) {
00637     for (int v = u + 1; v < NIdV.Len(); v++) {
00638       int SrcNID = NIdV[u], DstNID = NIdV[v];
00639       TIntSet JointCom;
00640       TAGMUtil::GetIntersection(NIDComVH.GetDat(SrcNID),NIDComVH.GetDat(DstNID),JointCom);
00641       if(JointCom.Len() == 0) {
00642         PairNoCom++;
00643         if (G->IsEdge(SrcNID, DstNID)) { EdgesNoCom++; }
00644         if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
00645       }
00646     }
00647     if (SamplePairs > 0 && PairNoCom >= (uint64) SamplePairs) { break; }
00648   }
00649   double DefaultVal = 1.0 / (double)G->GetNodes() / (double)G->GetNodes();
00650   if (EdgesNoCom > 0) {
00651     PNoCom = (double) EdgesNoCom / (double) PairNoCom;
00652   } else {
00653     PNoCom = DefaultVal;
00654   }
00655   printf("%s / %s edges without joint com detected (PNoCom = %f)\n", TUInt64::GetStr(EdgesNoCom).CStr(), TUInt64::GetStr(PairNoCom).CStr(), PNoCom.Val);
00656   return PNoCom;
00657 }
00658 
00659 void TAGMFit::PrintSummary() {
00660   TIntFltH CIDLambdaH(CIDNSetV.Len());
00661   for (int c = 0; c < CIDNSetV.Len(); c++) {
00662     CIDLambdaH.AddDat(c, LambdaV[c]);
00663   }
00664   CIDLambdaH.SortByDat(false);
00665   int Coms = 0;
00666   for (int i = 0; i < LambdaV.Len(); i++) {
00667     int CID = CIDLambdaH.GetKey(i);
00668     if (LambdaV[CID] <= 0.0001) { continue; }
00669     printf("P_c : %.3f Com Sz: %d, Total Edges inside: %d \n", 1.0 - exp(- LambdaV[CID]), CIDNSetV[CID].Len(), (int) ComEdgesV[CID]);
00670     Coms++;
00671   }
00672   printf("%d Communities, Total Memberships = %d, Likelihood = %.2f, Epsilon = %f\n", Coms, NIDCIDPrS.Len(), Likelihood(), PNoCom.Val);
00673 }