SNAP Library 2.1, User Reference  2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
agmfast.cpp
Go to the documentation of this file.
00001 
00002 #include "stdafx.h"
00003 #include "agmfast.h"
00004 #include "Snap.h"
00005 #include "agm.h"
00006 
00007 void TAGMFast::Save(TSOut& SOut) {
00008   G->Save(SOut);
00009   F.Save(SOut);
00010   NIDV.Save(SOut);
00011   RegCoef.Save(SOut);
00012   SumFV.Save(SOut);
00013   NodesOk.Save(SOut);
00014   MinVal.Save(SOut);
00015   MaxVal.Save(SOut);
00016   NegWgt.Save(SOut);
00017   NumComs.Save(SOut);
00018   HOVIDSV.Save(SOut);
00019   PNoCom.Save(SOut);
00020 }
00021 
00022 void TAGMFast::Load(TSIn& SIn, const int& RndSeed) {
00023   G->Load(SIn);
00024   F.Load(SIn);
00025   NIDV.Load(SIn);
00026   RegCoef.Load(SIn);
00027   SumFV.Load(SIn);
00028   NodesOk.Load(SIn);
00029   MinVal.Load(SIn);
00030   MaxVal.Load(SIn);
00031   NegWgt.Load(SIn);
00032   NumComs.Load(SIn);
00033   HOVIDSV.Load(SIn);
00034   PNoCom.Load(SIn);
00035   Rnd.PutSeed(RndSeed);
00036 }
00037 
00038 void TAGMFast::RandomInit(const int InitComs) {
00039   F.Gen(G->GetNodes());
00040   SumFV.Gen(InitComs);
00041   NumComs = InitComs;
00042   for (int u = 0; u < F.Len(); u++) {
00043     //assign to just one community
00044     int Mem = G->GetNI(u).GetDeg();
00045     if (Mem > 10) { Mem = 10; }
00046     for (int c = 0; c < Mem; c++) {
00047       int CID = Rnd.GetUniDevInt(InitComs);
00048       AddCom(u, CID, Rnd.GetUniDev());
00049     }
00050   }
00051   //assign a member to zero-member community (if any)
00052   for (int c = 0; c < SumFV.Len(); c++) {
00053     if (SumFV[c] == 0.0) {
00054       int UID = Rnd.GetUniDevInt(G->GetNodes());
00055       AddCom(UID, c, Rnd.GetUniDev());
00056     }
00057   }
00058 }
00059 
00060 void TAGMFast::NeighborComInit(const int InitComs) {
00061   //initialize with best neighborhood communities (Gleich et.al. KDD'12)
00062   F.Gen(G->GetNodes());
00063   SumFV.Gen(InitComs);
00064   NumComs = InitComs;
00065   const int Edges = G->GetEdges();
00066   //TIntFltH NCPhiH(F.Len());
00067   TFltIntPrV NIdPhiV(F.Len(), 0);
00068   TIntSet InvalidNIDS(F.Len());
00069   TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
00070   TExeTm RunTm;
00071   //compute conductance of neighborhood community
00072   for (int u = 0; u < F.Len(); u++) {
00073     TIntSet NBCmty(G->GetNI(u).GetDeg() + 1);
00074     double Phi;
00075     if (G->GetNI(u).GetDeg() < 5) { //do not include nodes with too few degree
00076       Phi = 1.0; 
00077     } else {
00078       TAGMUtil::GetNbhCom(G, u, NBCmty);
00079       IAssert(NBCmty.Len() == G->GetNI(u).GetDeg() + 1);
00080       Phi = TAGMUtil::GetConductance(G, NBCmty, Edges);
00081     }
00082     //NCPhiH.AddDat(u, Phi);
00083     NIdPhiV.Add(TFltIntPr(Phi, u));
00084   }
00085   NIdPhiV.Sort(true);
00086   printf("conductance computation completed [%s]\n", RunTm.GetTmStr());
00087   fflush(stdout);
00088   //choose nodes with local minimum in conductance
00089   int CurCID = 0;
00090   for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
00091     int UID = NIdPhiV[ui].Val2;
00092     fflush(stdout);
00093     if (InvalidNIDS.IsKey(UID)) { continue; }
00094     ChosenNIDV.Add(UID); //FOR DEBUG
00095     //add the node and its neighbors to the current community
00096     AddCom(UID, CurCID, 1.0);
00097     TUNGraph::TNodeI NI = G->GetNI(UID);
00098     fflush(stdout);
00099     for (int e = 0; e < NI.GetDeg(); e++) {
00100       AddCom(NI.GetNbrNId(e), CurCID, 1.0);
00101     }
00102     //exclude its neighbors from the next considerations
00103     for (int e = 0; e < NI.GetDeg(); e++) {
00104       InvalidNIDS.AddKey(NI.GetNbrNId(e));
00105     }
00106     CurCID++;
00107     fflush(stdout);
00108     if (CurCID >= NumComs) { break;  }
00109   }
00110   if (NumComs > CurCID) {
00111     printf("%d communities needed to fill randomly\n", NumComs - CurCID);
00112   }
00113   //assign a member to zero-member community (if any)
00114   for (int c = 0; c < SumFV.Len(); c++) {
00115     if (SumFV[c] == 0.0) {
00116       int ComSz = 10;
00117       for (int u = 0; u < ComSz; u++) {
00118         int UID = Rnd.GetUniDevInt(G->GetNodes());
00119         AddCom(UID, c, Rnd.GetUniDev());
00120       }
00121     }
00122   }
00123 }
00124 
00125 void TAGMFast::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
00126   F.Gen(G->GetNodes());
00127   SumFV.Gen(CmtyVV.Len());
00128   NumComs = CmtyVV.Len();
00129   TIntH NIDIdxH(NIDV.Len());
00130   if (! NodesOk) {
00131     for (int u = 0; u < NIDV.Len(); u++) {
00132       NIDIdxH.AddDat(NIDV[u], u);
00133     }
00134   }
00135   for (int c = 0; c < CmtyVV.Len(); c++) {
00136     for (int u = 0; u < CmtyVV[c].Len(); u++) {
00137       int UID = CmtyVV[c][u];
00138       if (! NodesOk) { UID = NIDIdxH.GetDat(UID); }
00139       if (G->IsNode(UID)) { 
00140         AddCom(UID, c, 1.0);
00141       }
00142     }
00143   }
00144 }
00145 
00146 void TAGMFast::SetGraph(const PUNGraph& GraphPt) {
00147   G = GraphPt;
00148   HOVIDSV.Gen(G->GetNodes());  
00149   NodesOk = true;
00150   GraphPt->GetNIdV(NIDV);
00151   // check that nodes IDs are {0,1,..,Nodes-1}
00152   for (int nid = 0; nid < GraphPt->GetNodes(); nid++) {
00153     if (! GraphPt->IsNode(nid)) { 
00154       NodesOk = false; 
00155       break; 
00156     } 
00157   }
00158   if (! NodesOk) {
00159     printf("rearrage nodes\n");
00160     G = TSnap::GetSubGraph(GraphPt, NIDV, true);
00161     for (int nid = 0; nid < G->GetNodes(); nid++) {
00162       IAssert(G->IsNode(nid)); 
00163     }
00164   }
00165   TSnap::DelSelfEdges(G);
00166   
00167   PNoCom = 1.0 / (double) G->GetNodes();
00168   DoParallel = false;
00169   if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow
00170   NegWgt = 1.0;
00171 }
00172 
00173 double TAGMFast::Likelihood(const bool _DoParallel) { 
00174   TExeTm ExeTm;
00175   double L = 0.0;
00176   if (_DoParallel) {
00177   #pragma omp parallel for 
00178     for (int u = 0; u < F.Len(); u++) {
00179       double LU = LikelihoodForRow(u);
00180       #pragma omp atomic
00181         L += LU;
00182     }
00183   }
00184   else {
00185     for (int u = 0; u < F.Len(); u++) {
00186       double LU = LikelihoodForRow(u);
00187         L += LU;
00188     }
00189   }
00190   return L;
00191 }
00192 
00193 double TAGMFast::LikelihoodForRow(const int UID) {
00194   return LikelihoodForRow(UID, F[UID]);
00195 }
00196 
00197 
00198 double TAGMFast::LikelihoodForRow(const int UID, const TIntFltH& FU) {
00199   double L = 0.0;
00200   TFltV HOSumFV; //adjust for Fv of v hold out
00201   if (HOVIDSV[UID].Len() > 0) {
00202     HOSumFV.Gen(SumFV.Len());
00203     
00204     for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
00205       for (int c = 0; c < SumFV.Len(); c++) {
00206         HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
00207       }
00208     }
00209   }
00210 
00211   TUNGraph::TNodeI NI = G->GetNI(UID);
00212   if (DoParallel && NI.GetDeg() > 10) {
00213 #pragma omp parallel for schedule(static, 1)
00214     for (int e = 0; e < NI.GetDeg(); e++) {
00215       int v = NI.GetNbrNId(e);
00216       if (v == UID) { continue; }
00217       if (HOVIDSV[UID].IsKey(v)) { continue; }
00218       double LU = log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
00219 #pragma omp atomic
00220       L += LU;
00221     }
00222     for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
00223       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00224       double LU = NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
00225       L -= LU;
00226     }
00227   } else {
00228     for (int e = 0; e < NI.GetDeg(); e++) {
00229       int v = NI.GetNbrNId(e);
00230       if (v == UID) { continue; }
00231       if (HOVIDSV[UID].IsKey(v)) { continue; }
00232       L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
00233     }
00234     for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
00235       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00236       L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
00237     }
00238   }
00239   //add regularization
00240   if (RegCoef > 0.0) { //L1
00241     L -= RegCoef * Sum(FU);
00242   }
00243   if (RegCoef < 0.0) { //L2
00244     L += RegCoef * Norm2(FU);
00245   }
00246 
00247   return L;
00248 }
00249 
00250 void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
00251   GradU.Gen(CIDSet.Len());
00252 
00253   TFltV HOSumFV; //adjust for Fv of v hold out
00254   if (HOVIDSV[UID].Len() > 0) {
00255     HOSumFV.Gen(SumFV.Len());
00256     
00257     for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
00258       for (int c = 0; c < SumFV.Len(); c++) {
00259         HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
00260       }
00261     }
00262   }
00263     
00264   TUNGraph::TNodeI NI = G->GetNI(UID);
00265   int Deg = NI.GetDeg();
00266   TFltV PredV(Deg), GradV(CIDSet.Len());
00267   TIntV CIDV(CIDSet.Len());
00268   if (DoParallel && Deg + CIDSet.Len() > 10) {
00269 #pragma omp parallel for schedule(static, 1)
00270     for (int e = 0; e < Deg; e++) {
00271       if (NI.GetNbrNId(e) == UID) { continue; }
00272       if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
00273       PredV[e] = Prediction(UID, NI.GetNbrNId(e));
00274     }
00275   
00276 #pragma omp parallel for schedule(static, 1)
00277     for (int c = 0; c < CIDSet.Len(); c++) {
00278       int CID = CIDSet.GetKey(c);
00279       double Val = 0.0;
00280       for (int e = 0; e < Deg; e++) {
00281         int VID = NI.GetNbrNId(e);
00282         if (VID == UID) { continue; }
00283         if (HOVIDSV[UID].IsKey(VID)) { continue; }
00284         Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
00285       }
00286       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00287       Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
00288       CIDV[c] = CID;
00289       GradV[c] = Val;
00290     }
00291   } 
00292   else {
00293     for (int e = 0; e < Deg; e++) {
00294       if (NI.GetNbrNId(e) == UID) { continue; }
00295       if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
00296       PredV[e] = Prediction(UID, NI.GetNbrNId(e));
00297     }
00298   
00299     for (int c = 0; c < CIDSet.Len(); c++) {
00300       int CID = CIDSet.GetKey(c);
00301       double Val = 0.0;
00302       for (int e = 0; e < Deg; e++) {
00303         int VID = NI.GetNbrNId(e);
00304         if (VID == UID) { continue; }
00305         if (HOVIDSV[UID].IsKey(VID)) { continue; }
00306         Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
00307       }
00308       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00309       Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
00310       CIDV[c] = CID;
00311       GradV[c] = Val;
00312     }
00313   }
00314   //add regularization
00315   if (RegCoef > 0.0) { //L1
00316     for (int c = 0; c < GradV.Len(); c++) {
00317       GradV[c] -= RegCoef; 
00318     }
00319   }
00320   if (RegCoef < 0.0) { //L2
00321     for (int c = 0; c < GradV.Len(); c++) {
00322       GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 
00323     }
00324   }
00325 
00326 
00327   for (int c = 0; c < GradV.Len(); c++) {
00328     if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
00329     if (fabs(GradV[c]) < 0.0001) { continue; }
00330     GradU.AddDat(CIDV[c], GradV[c]);
00331   }
00332   for (int c = 0; c < GradU.Len(); c++) {
00333     if (GradU[c] >= 10) { GradU[c] = 10; }
00334     if (GradU[c] <= -10) { GradU[c] = -10; }
00335     IAssert(GradU[c] >= -10);
00336   }
00337 }
00338 
00339 double TAGMFast::GradientForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00340   TUNGraph::TNodeI UI = G->GetNI(UID);
00341   double Grad = 0.0, PNoEdge;
00342   int VID = 0;
00343   for (int e = 0; e < UI.GetDeg(); e++) {
00344     VID = UI.GetNbrNId(e);
00345     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00346     if (! F[VID].IsKey(CID)) { continue; }
00347     PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00348     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00349     //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge;
00350     Grad += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID));
00351   }
00352   Grad -= NegWgt * (SumFV[CID] - GetCom(UID, CID));
00353   //add regularization
00354   if (RegCoef > 0.0) { //L1
00355     Grad -= RegCoef; 
00356   }
00357   if (RegCoef < 0.0) { //L2
00358     Grad += 2 * RegCoef * Val; 
00359   }
00360 
00361   return Grad;
00362 }
00363 
00364 double TAGMFast::LikelihoodForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00365   TUNGraph::TNodeI UI = G->GetNI(UID);
00366   double L = 0.0, PNoEdge;
00367   int VID = 0;
00368   for (int e = 0; e < UI.GetDeg(); e++) {
00369     VID = UI.GetNbrNId(e);
00370     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00371     if (! F[VID].IsKey(CID)) { 
00372       PNoEdge = AlphaKV[e];
00373     } else {
00374       PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00375     }
00376     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00377     //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge;
00378     L += log(1.0 - PNoEdge) + NegWgt * GetCom(VID, CID) * Val;
00379 
00380     //  += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID));
00381   }
00382   L -= NegWgt * (SumFV[CID] - GetCom(UID, CID)) * Val;
00383   //add regularization
00384   if (RegCoef > 0.0) { //L1
00385     L -= RegCoef * Val;
00386   }
00387   if (RegCoef < 0.0) { //L2
00388     L += RegCoef * Val * Val; 
00389   }
00390 
00391   return L;
00392 }
00393 
00394 double TAGMFast::HessianForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00395   TUNGraph::TNodeI UI = G->GetNI(UID);
00396   double H = 0.0, PNoEdge;
00397   int VID = 0;
00398   for (int e = 0; e < UI.GetDeg(); e++) {
00399     VID = UI.GetNbrNId(e);
00400     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00401     if (! F[VID].IsKey(CID)) { continue; }
00402     PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00403     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00404     //PNoEdge = PNoEdge == 1.0? 1 - PNoCom: PNoEdge;
00405     H += (- PNoEdge * F[VID].GetDat(CID) * F[VID].GetDat(CID)) / (1.0 - PNoEdge) / (1.0 - PNoEdge);
00406   }
00407   //add regularization
00408   if (RegCoef < 0.0) { //L2
00409     H += 2 * RegCoef; 
00410   }
00411   IAssert (H <= 0.0);
00412   return H;
00413 }
00414 
00416 int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
00417   TExeTm ExeTm;
00418   int iter = 0, PrevIter = 0;
00419   TIntFltPrV IterLV;
00420   double PrevL = TFlt::Mn, CurL;
00421   TUNGraph::TNodeI UI;
00422   TIntV NIdxV;
00423   G->GetNIdV(NIdxV);
00424   int CID, UID, NewtonIter;
00425   double Fuc;
00426   //double PrevFuc;
00427   double Grad, H;
00428   while(iter < MaxIter) {
00429     NIdxV.Shuffle(Rnd);
00430     for (int ui = 0; ui < F.Len(); ui++, iter++) {
00431       if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
00432         IterLV.Add(TIntFltPr(iter, Likelihood(false)));
00433       }
00434       UID = NIdxV[ui];
00435       //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00436       TIntSet CIDSet;
00437       UI = G->GetNI(UID);
00438       if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip
00439         if (! F[UID].Empty()) { F[UID].Clr(); }
00440         continue;
00441       }
00442       for (int e = 0; e < UI.GetDeg(); e++) {
00443         if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00444         TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00445         for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00446           CIDSet.AddKey(CI.GetKey());
00447         }
00448       }
00449       for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00450         if (! CIDSet.IsKey(CI.GetKey())) {
00451           DelCom(UID, CI.GetKey());
00452         }
00453       }
00454       if (CIDSet.Empty()) { continue; }
00455       for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
00456         CID = CI.GetKey();
00457         //optimize for UID, CID
00458         //compute constants
00459         TFltV AlphaKV(UI.GetDeg());
00460         for (int e = 0; e < UI.GetDeg(); e++) {
00461           if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00462           AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
00463           IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
00464         }
00465         Fuc = GetCom(UID, CID);
00466         //PrevFuc = Fuc;
00467         Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
00468         if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
00469         NewtonIter = 0;
00470         while (NewtonIter++ < 10) {
00471           Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
00472           H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
00473           if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
00474           if (fabs(Grad) < 1e-3) { break; }
00475           if (H == 0.0) { Fuc = 0.0; break; }
00476           double NewtonStep = - Grad / H;
00477           if (NewtonStep < -0.5) { NewtonStep = - 0.5; }
00478           Fuc += NewtonStep;
00479           if (Fuc < 0.0) { Fuc = 0.0; }
00480         }
00481         if (Fuc == 0.0) {
00482           DelCom(UID, CID);
00483         }
00484         else {
00485           AddCom(UID, CID, Fuc);
00486         }
00487       }
00488     }
00489     if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
00490       PrevIter = iter;
00491       CurL = Likelihood();
00492       if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
00493         printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
00494       }
00495       fflush(stdout);
00496       if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
00497       else { PrevL = CurL; }
00498     }
00499     
00500   }
00501   if (! PlotNm.Empty()) {
00502     printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
00503     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00504   }
00505   return iter;
00506 }
00507 
00508 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV) {
00509   GetCmtyVV(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
00510 }
00511 
00513 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) {
00514   CmtyVV.Gen(NumComs, 0);
00515   TIntFltH CIDSumFH(NumComs);
00516   for (int c = 0; c < SumFV.Len(); c++) {
00517     CIDSumFH.AddDat(c, SumFV[c]);
00518   }
00519   CIDSumFH.SortByDat(false);
00520   for (int c = 0; c < NumComs; c++) {
00521     int CID = CIDSumFH.GetKey(c);
00522     TIntFltH NIDFucH(F.Len() / 10);
00523     TIntV CmtyV;
00524     IAssert(SumFV[CID] == CIDSumFH.GetDat(CID));
00525     if (SumFV[CID] < Thres) { continue; }
00526     for (int u = 0; u < F.Len(); u++) {
00527       int NID = u;
00528       if (! NodesOk) { NID = NIDV[u]; }
00529       if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); }
00530     }
00531     NIDFucH.SortByDat(false);
00532     NIDFucH.GetKeyV(CmtyV);
00533     if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); }
00534   }
00535   if ( NumComs != CmtyVV.Len()) {
00536     printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
00537   }
00538 }
00539 
00541 int TAGMFast::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const double StepAlpha, const double StepBeta) {
00542     double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs);
00543     TIntV ComsV;
00544     ComsV.Add(MinComs);
00545     while (ComsV.Len() < DivComs) {
00546       int NewComs = int(ComsV.Last() * ComsGap);
00547       if (NewComs == ComsV.Last().Val) { NewComs++; }
00548       ComsV.Add(NewComs);
00549     }
00550     if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); }
00551     return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm + ".CV.likelihood", StepAlpha, StepBeta);
00552 }
00553 
00554 int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) {
00555   if (ComsV.Len() == 0) {
00556     int MaxComs = G->GetNodes() / 5;
00557     ComsV.Add(2);
00558     while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); }
00559   }
00560   TIntPrV EdgeV(G->GetEdges(), 0);
00561   for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) {
00562     EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId()));
00563   }
00564   EdgeV.Shuffle(Rnd);
00565   int MaxIterCV = 3;
00566 
00567   TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV);
00568   if (EdgeV.Len() > 50) { //if edges are many enough, use CV
00569     printf("generating hold out set\n");
00570     TIntV NIdV1, NIdV2;
00571     G->GetNIdV(NIdV1);
00572     G->GetNIdV(NIdV2);
00573     for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
00574       // generate holdout sets
00575       HoldOutSets[IterCV].Gen(G->GetNodes());
00576       const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0);
00577       int HOCnt = 0;
00578       int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges());
00579       printf("holding out %d edges...\n", HOEdges);
00580       for (int he = 0; he < (int) HOEdges; he++) {
00581         HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2);
00582         HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1);
00583         HOCnt++;
00584       }
00585       printf("%d Edges hold out\n", HOCnt);
00586       while(HOCnt++ < HOTotal) {
00587         int SrcNID = Rnd.GetUniDevInt(G->GetNodes());
00588         int DstNID = Rnd.GetUniDevInt(G->GetNodes());
00589         HoldOutSets[IterCV][SrcNID].AddKey(DstNID);
00590         HoldOutSets[IterCV][DstNID].AddKey(SrcNID);
00591       }
00592     }
00593     printf("hold out set generated\n");
00594   }
00595 
00596   TFltV HOLV(ComsV.Len());
00597   TIntFltPrV ComsLV;
00598   for (int c = 0; c < ComsV.Len(); c++) {
00599     const int Coms = ComsV[c];
00600     printf("Try number of Coms:%d\n", Coms);
00601     NeighborComInit(Coms);
00602     printf("Initialized\n");
00603 
00604     if (EdgeV.Len() > 50) { //if edges are many enough, use CV
00605       for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
00606         HOVIDSV = HoldOutSets[IterCV];
00607 
00608         if (NumThreads == 1) {
00609           printf("MLE without parallelization begins\n");
00610           MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta);
00611         } else {
00612           printf("MLE with parallelization begins\n");
00613           MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta);
00614         }
00615         double HOL = LikelihoodHoldOut();
00616         HOL = HOL < 0? HOL: TFlt::Mn;
00617         HOLV[c] += HOL;
00618       }
00619     }
00620     else {
00621       HOVIDSV.Gen(G->GetNodes());
00622       MLEGradAscent(0.0001, 100 * G->GetNodes(), "");
00623       double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes());
00624       HOLV[c] = BIC;
00625     }
00626   }
00627   int EstComs = 2;
00628   double MaxL = TFlt::Mn;
00629   printf("\n");
00630   for (int c = 0; c < ComsV.Len(); c++) {
00631     ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val));
00632     printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val);
00633     if (MaxL < HOLV[c]) {
00634       MaxL = HOLV[c];
00635       EstComs = ComsV[c];
00636     }
00637   }
00638   printf("\n");
00639   RandomInit(EstComs);
00640   HOVIDSV.Gen(G->GetNodes());
00641   if (! PlotLFNm.Empty()) {
00642     TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood");
00643   }
00644   return EstComs;
00645 }
00646 
00647 double TAGMFast::LikelihoodHoldOut(const bool DoParallel) { 
00648   double L = 0.0;
00649   for (int u = 0; u < HOVIDSV.Len(); u++) {
00650     for (int e = 0; e < HOVIDSV[u].Len(); e++) {
00651       int VID = HOVIDSV[u][e];
00652       if (VID == u) { continue; } 
00653       double Pred = Prediction(u, VID);
00654       if (G->IsEdge(u, VID)) {
00655         L += log(1.0 - Pred);
00656       }
00657       else {
00658         L += NegWgt * log(Pred);
00659       }
00660     }
00661   }
00662   return L;
00663 }
00664 
00665 double TAGMFast::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) {
00666   double StepSize = 1.0;
00667   double InitLikelihood = LikelihoodForRow(UID);
00668   TIntFltH NewVarV(DeltaV.Len());
00669   for(int iter = 0; iter < MaxIter; iter++) {
00670     for (int i = 0; i < DeltaV.Len(); i++){
00671       int CID = DeltaV.GetKey(i);
00672       double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID);
00673       if (NewVal < MinVal) { NewVal = MinVal; }
00674       if (NewVal > MaxVal) { NewVal = MaxVal; }
00675       NewVarV.AddDat(CID, NewVal);
00676     }
00677     if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) {
00678       StepSize *= Beta;
00679     } else {
00680       break;
00681     }
00682     if (iter == MaxIter - 1) { 
00683       StepSize = 0.0;
00684       break;
00685     }
00686   }
00687   return StepSize;
00688 }
00689 
00690 int TAGMFast::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
00691   time_t InitTime = time(NULL);
00692   TExeTm ExeTm, CheckTm;
00693   int iter = 0, PrevIter = 0;
00694   TIntFltPrV IterLV;
00695   TUNGraph::TNodeI UI;
00696   double PrevL = TFlt::Mn, CurL = 0.0;
00697   TIntV NIdxV(F.Len(), 0);
00698   for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
00699   IAssert(NIdxV.Len() == F.Len());
00700   TIntFltH GradV;
00701   while(iter < MaxIter) {
00702     NIdxV.Shuffle(Rnd);
00703     for (int ui = 0; ui < F.Len(); ui++, iter++) {
00704       int u = NIdxV[ui]; //
00705       //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00706       UI = G->GetNI(u);
00707       TIntSet CIDSet(5 * UI.GetDeg());
00708       for (int e = 0; e < UI.GetDeg(); e++) {
00709         if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
00710         TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00711         for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00712           CIDSet.AddKey(CI.GetKey());
00713         }
00714       }
00715       for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00716         if (! CIDSet.IsKey(CI.GetKey())) {
00717           DelCom(u, CI.GetKey());
00718         }
00719       }
00720       if (CIDSet.Empty()) { continue; }
00721       GradientForRow(u, GradV, CIDSet);
00722       if (Norm2(GradV) < 1e-4) { continue; }
00723       double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
00724       if (LearnRate == 0.0) { continue; }
00725       for (int ci = 0; ci < GradV.Len(); ci++) {
00726         int CID = GradV.GetKey(ci);
00727         double Change = LearnRate * GradV.GetDat(CID);
00728         double NewFuc = GetCom(u, CID) + Change;
00729         if (NewFuc <= 0.0) {
00730           DelCom(u, CID);
00731         } else {
00732           AddCom(u, CID, NewFuc);
00733         }
00734       }
00735       if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) {
00736         IterLV.Add(TIntFltPr(iter, Likelihood(false)));
00737       }
00738     }
00739     printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime);
00740     fflush(stdout);
00741     if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
00742       PrevIter = iter;
00743       CurL = Likelihood();
00744       if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
00745         printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
00746       }
00747       fflush(stdout);
00748       if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
00749       else { PrevL = CurL; }
00750     }
00751     
00752   }
00753   printf("\n");
00754   printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
00755   if (! PlotNm.Empty()) {
00756     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00757   }
00758   return iter;
00759 }
00760 
00761 int TAGMFast::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
00762   //parallel
00763   time_t InitTime = time(NULL);
00764   uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs();
00765   TExeTm ExeTm, CheckTm;
00766   double PrevL = Likelihood(true);
00767   TIntFltPrV IterLV;
00768   int PrevIter = 0;
00769   int iter = 0;
00770   TIntV NIdxV(F.Len(), 0);
00771   for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
00772   TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization
00773   NIDOPTV.PutAll(0);
00774   TVec<TIntFltH> NewF(ChunkNum * ChunkSize);
00775   TIntV NewNIDV(ChunkNum * ChunkSize);
00776   for (iter = 0; iter < MaxIter; iter++) {
00777     NIdxV.Clr(false);
00778     for (int i = 0; i < F.Len(); i++) { 
00779       if (NIDOPTV[i] == 0) {  NIdxV.Add(i); }
00780     }
00781     IAssert (NIdxV.Len() <= F.Len());
00782     NIdxV.Shuffle(Rnd);
00783     // compute gradient for chunk of nodes
00784 #pragma omp parallel for schedule(static, 1)
00785     for (int TIdx = 0; TIdx < ChunkNum; TIdx++) {
00786       TIntFltH GradV;
00787       for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) {
00788         NewNIDV[ui] = -1;
00789         if (ui > NIdxV.Len()) { continue; }
00790         int u = NIdxV[ui]; //
00791         //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00792         TUNGraph::TNodeI UI = G->GetNI(u);
00793         TIntSet CIDSet(5 * UI.GetDeg());
00794         TIntFltH CurFU = F[u];
00795         for (int e = 0; e < UI.GetDeg(); e++) {
00796           if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
00797           TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00798           for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00799             CIDSet.AddKey(CI.GetKey());
00800           }
00801         }
00802         if (CIDSet.Empty()) { 
00803           CurFU.Clr();
00804         }
00805         else {
00806           for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00807             if (! CIDSet.IsKey(CI.GetKey())) {
00808               CurFU.DelIfKey(CI.GetKey());
00809             }
00810           }
00811           GradientForRow(u, GradV, CIDSet);
00812           if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; }
00813           double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta, 5);
00814           if (LearnRate <= 1e-5) { NewNIDV[ui] = -2; continue; }
00815           for (int ci = 0; ci < GradV.Len(); ci++) {
00816             int CID = GradV.GetKey(ci);
00817             double Change = LearnRate * GradV.GetDat(CID);
00818             double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change;
00819             if (NewFuc <= 0.0) {
00820               CurFU.DelIfKey(CID);
00821             } else {
00822               CurFU.AddDat(CID) = NewFuc;
00823             }
00824           }
00825           CurFU.Defrag();
00826         }
00827         //store changes
00828         NewF[ui] = CurFU;
00829         NewNIDV[ui] = u;
00830       }
00831     }
00832     int NumNoChangeGrad = 0;
00833     int NumNoChangeStepSize = 0;
00834     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00835       int NewNID = NewNIDV[ui];
00836       if (NewNID == -1) { NumNoChangeGrad++; continue; }
00837       if (NewNID == -2) { NumNoChangeStepSize++; continue; }
00838       for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
00839         SumFV[CI.GetKey()] -= CI.GetDat();
00840       }
00841     }
00842 #pragma omp parallel for
00843     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00844       int NewNID = NewNIDV[ui];
00845       if (NewNID < 0) { continue; }
00846       F[NewNID] = NewF[ui];
00847     }
00848     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00849       int NewNID = NewNIDV[ui];
00850       if (NewNID < 0) { continue; }
00851       for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
00852         SumFV[CI.GetKey()] += CI.GetDat();
00853       }
00854     }
00855     // update the nodes who are optimal
00856     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00857       int NewNID = NewNIDV[ui];
00858       if (NewNID < 0) { continue; }
00859       TUNGraph::TNodeI UI = G->GetNI(NewNID);
00860       NIDOPTV[NewNID] = 0;
00861       for (int e = 0; e < UI.GetDeg(); e++) {
00862         NIDOPTV[UI.GetNbrNId(e)] = 0;
00863       }
00864     }
00865     int OPTCnt = 0;
00866     for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } }
00867     if (! PlotNm.Empty()) {
00868       printf("\r%d iterations [%s] %d secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), int(TSecTm::GetCurTm().GetAbsSecs() - StartTm));
00869       if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); }
00870       fflush(stdout);
00871     }
00872     if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) {
00873       PrevIter = iter;
00874       double CurL = Likelihood(true);
00875       IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL));
00876       printf("\r%d iterations, Likelihood: %f, Diff: %f [%d secs]", iter, CurL,  CurL - PrevL, int(time(NULL) - InitTime));
00877        fflush(stdout);
00878       if (CurL - PrevL <= Thres * fabs(PrevL)) { 
00879         break;
00880       }
00881       else {
00882         PrevL = CurL;
00883       }
00884     }
00885   }
00886   if (! PlotNm.Empty()) {
00887     printf("\nMLE completed with %d iterations(%d secs)\n", iter, int(TSecTm::GetCurTm().GetAbsSecs() - StartTm));
00888     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00889   } else {
00890     printf("\rMLE completed with %d iterations(%d secs)", iter, int(time(NULL) - InitTime));
00891     fflush(stdout);
00892   }
00893   return iter;
00894 }