SNAP Library 2.0, User Reference  2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
agmfast.cpp
Go to the documentation of this file.
00001 
00002 #include "stdafx.h"
00003 #include "agmfast.h"
00004 #include "Snap.h"
00005 #include "agm.h"
00006 
00007 void TAGMFast::Save(TSOut& SOut) {
00008   G->Save(SOut);
00009   F.Save(SOut);
00010   NIDV.Save(SOut);
00011   RegCoef.Save(SOut);
00012   SumFV.Save(SOut);
00013   NodesOk.Save(SOut);
00014   MinVal.Save(SOut);
00015   MaxVal.Save(SOut);
00016   NegWgt.Save(SOut);
00017   NumComs.Save(SOut);
00018   HOVIDSV.Save(SOut);
00019   PNoCom.Save(SOut);
00020 }
00021 
00022 void TAGMFast::Load(TSIn& SIn, const int& RndSeed) {
00023   G->Load(SIn);
00024   F.Load(SIn);
00025   NIDV.Load(SIn);
00026   RegCoef.Load(SIn);
00027   SumFV.Load(SIn);
00028   NodesOk.Load(SIn);
00029   MinVal.Load(SIn);
00030   MaxVal.Load(SIn);
00031   NegWgt.Load(SIn);
00032   NumComs.Load(SIn);
00033   HOVIDSV.Load(SIn);
00034   PNoCom.Load(SIn);
00035   Rnd.PutSeed(RndSeed);
00036 }
00037 
00038 void TAGMFast::RandomInit(const int InitComs) {
00039   F.Gen(G->GetNodes());
00040   SumFV.Gen(InitComs);
00041   NumComs = InitComs;
00042   for (int u = 0; u < F.Len(); u++) {
00043     //assign to just one community
00044     int Mem = G->GetNI(u).GetDeg();
00045     if (Mem > 10) { Mem = 10; }
00046     for (int c = 0; c < Mem; c++) {
00047       int CID = Rnd.GetUniDevInt(InitComs);
00048       AddCom(u, CID, Rnd.GetUniDev());
00049     }
00050   }
00051   //assign a member to zero-member community (if any)
00052   for (int c = 0; c < SumFV.Len(); c++) {
00053     if (SumFV[c] == 0.0) {
00054       int UID = Rnd.GetUniDevInt(G->GetNodes());
00055       AddCom(UID, c, Rnd.GetUniDev());
00056     }
00057   }
00058 }
00059 
00060 void TAGMFast::NeighborComInit(const int InitComs) {
00061   //initialize with best neighborhood communities (Gleich et.al. KDD'12)
00062   F.Gen(G->GetNodes());
00063   SumFV.Gen(InitComs);
00064   NumComs = InitComs;
00065   const int Edges = G->GetEdges();
00066   //TIntFltH NCPhiH(F.Len());
00067   TFltIntPrV NIdPhiV(F.Len(), 0);
00068   TIntSet InvalidNIDS(F.Len());
00069   TIntV ChosenNIDV(InitComs, 0); //FOR DEBUG
00070   TExeTm RunTm;
00071   //compute conductance of neighborhood community
00072   for (int u = 0; u < F.Len(); u++) {
00073     TIntSet NBCmty(G->GetNI(u).GetDeg() + 1);
00074     double Phi;
00075     if (G->GetNI(u).GetDeg() < 5) { //do not include nodes with too few degree
00076       Phi = 1.0; 
00077     } else {
00078       TAGMUtil::GetNbhCom(G, u, NBCmty);
00079       IAssert(NBCmty.Len() == G->GetNI(u).GetDeg() + 1);
00080       Phi = TAGMUtil::GetConductance(G, NBCmty, Edges);
00081     }
00082     //NCPhiH.AddDat(u, Phi);
00083     NIdPhiV.Add(TFltIntPr(Phi, u));
00084   }
00085   NIdPhiV.Sort(true);
00086   printf("conductance computation completed [%s]\n", RunTm.GetTmStr());
00087   fflush(stdout);
00088   //choose nodes with local minimum in conductance
00089   int CurCID = 0;
00090   for (int ui = 0; ui < NIdPhiV.Len(); ui++) {
00091     int UID = NIdPhiV[ui].Val2;
00092     fflush(stdout);
00093     if (InvalidNIDS.IsKey(UID)) { continue; }
00094     ChosenNIDV.Add(UID); //FOR DEBUG
00095     //add the node and its neighbors to the current community
00096     AddCom(UID, CurCID, 1.0);
00097     TUNGraph::TNodeI NI = G->GetNI(UID);
00098     fflush(stdout);
00099     for (int e = 0; e < NI.GetDeg(); e++) {
00100       AddCom(NI.GetNbrNId(e), CurCID, 1.0);
00101     }
00102     //exclude its neighbors from the next considerations
00103     for (int e = 0; e < NI.GetDeg(); e++) {
00104       InvalidNIDS.AddKey(NI.GetNbrNId(e));
00105     }
00106     CurCID++;
00107     fflush(stdout);
00108     if (CurCID >= NumComs) { break;  }
00109   }
00110   if (NumComs > CurCID) {
00111     printf("%d communities needed to fill randomly\n", NumComs - CurCID);
00112   }
00113   //assign a member to zero-member community (if any)
00114   for (int c = 0; c < SumFV.Len(); c++) {
00115     if (SumFV[c] == 0.0) {
00116       int ComSz = 10;
00117       for (int u = 0; u < ComSz; u++) {
00118         int UID = Rnd.GetUniDevInt(G->GetNodes());
00119         AddCom(UID, c, Rnd.GetUniDev());
00120       }
00121     }
00122   }
00123 }
00124 
00125 void TAGMFast::SetCmtyVV(const TVec<TIntV>& CmtyVV) {
00126   F.Gen(G->GetNodes());
00127   SumFV.Gen(CmtyVV.Len());
00128   NumComs = CmtyVV.Len();
00129   TIntH NIDIdxH(NIDV.Len());
00130   if (! NodesOk) {
00131     for (int u = 0; u < NIDV.Len(); u++) {
00132       NIDIdxH.AddDat(NIDV[u], u);
00133     }
00134   }
00135   for (int c = 0; c < CmtyVV.Len(); c++) {
00136     for (int u = 0; u < CmtyVV[c].Len(); u++) {
00137       int UID = CmtyVV[c][u];
00138       if (! NodesOk) { UID = NIDIdxH.GetDat(UID); }
00139       if (G->IsNode(UID)) { 
00140         AddCom(UID, c, 1.0);
00141       }
00142     }
00143   }
00144 }
00145 
00146 void TAGMFast::SetGraph(const PUNGraph& GraphPt) {
00147   G = GraphPt;
00148   HOVIDSV.Gen(G->GetNodes());  
00149   NodesOk = true;
00150   GraphPt->GetNIdV(NIDV);
00151   // check that nodes IDs are {0,1,..,Nodes-1}
00152   for (int nid = 0; nid < GraphPt->GetNodes(); nid++) {
00153     if (! GraphPt->IsNode(nid)) { 
00154       NodesOk = false; 
00155       break; 
00156     } 
00157   }
00158   if (! NodesOk) {
00159     printf("rearrage nodes\n");
00160     G = TSnap::GetSubGraph(GraphPt, NIDV, true);
00161     for (int nid = 0; nid < G->GetNodes(); nid++) {
00162       IAssert(G->IsNode(nid)); 
00163     }
00164   }
00165   TSnap::DelSelfEdges(G);
00166   
00167   PNoCom = 1.0 / (double) G->GetNodes();
00168   DoParallel = false;
00169   if (1.0 / PNoCom > sqrt(TFlt::Mx)) { PNoCom = 0.99 / sqrt(TFlt::Mx); } // to prevent overflow
00170   NegWgt = 1.0;
00171 }
00172 
00173 double TAGMFast::Likelihood(const bool _DoParallel) { 
00174   TExeTm ExeTm;
00175   double L = 0.0;
00176   if (_DoParallel) {
00177   #pragma omp parallel for 
00178     for (int u = 0; u < F.Len(); u++) {
00179       double LU = LikelihoodForRow(u);
00180       #pragma omp atomic
00181         L += LU;
00182     }
00183   }
00184   else {
00185     for (int u = 0; u < F.Len(); u++) {
00186       double LU = LikelihoodForRow(u);
00187         L += LU;
00188     }
00189   }
00190   return L;
00191 }
00192 
00193 double TAGMFast::LikelihoodForRow(const int UID) {
00194   return LikelihoodForRow(UID, F[UID]);
00195 }
00196 
00197 
00198 double TAGMFast::LikelihoodForRow(const int UID, const TIntFltH& FU) {
00199   double L = 0.0;
00200   TFltV HOSumFV; //adjust for Fv of v hold out
00201   if (HOVIDSV[UID].Len() > 0) {
00202     HOSumFV.Gen(SumFV.Len());
00203     
00204     for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
00205       for (int c = 0; c < SumFV.Len(); c++) {
00206         HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
00207       }
00208     }
00209   }
00210 
00211   TUNGraph::TNodeI NI = G->GetNI(UID);
00212   if (DoParallel && NI.GetDeg() > 10) {
00213 #pragma omp parallel for schedule(static, 1)
00214     for (int e = 0; e < NI.GetDeg(); e++) {
00215       int v = NI.GetNbrNId(e);
00216       if (v == UID) { continue; }
00217       if (HOVIDSV[UID].IsKey(v)) { continue; }
00218       double LU = log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
00219 #pragma omp atomic
00220       L += LU;
00221     }
00222     for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
00223       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00224       double LU = NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
00225       L -= LU;
00226     }
00227   } else {
00228     for (int e = 0; e < NI.GetDeg(); e++) {
00229       int v = NI.GetNbrNId(e);
00230       if (v == UID) { continue; }
00231       if (HOVIDSV[UID].IsKey(v)) { continue; }
00232       L += log (1.0 - Prediction(FU, F[v])) + NegWgt * DotProduct(FU, F[v]);
00233     }
00234     for (TIntFltH::TIter HI = FU.BegI(); HI < FU.EndI(); HI++) {
00235       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[HI.GetKey()].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00236       L -= NegWgt * (SumFV[HI.GetKey()] - HOSum - GetCom(UID, HI.GetKey())) * HI.GetDat();
00237     }
00238   }
00239   //add regularization
00240   if (RegCoef > 0.0) { //L1
00241     L -= RegCoef * Sum(FU);
00242   }
00243   if (RegCoef < 0.0) { //L2
00244     L += RegCoef * Norm2(FU);
00245   }
00246 
00247   return L;
00248 }
00249 
00250 void TAGMFast::GradientForRow(const int UID, TIntFltH& GradU, const TIntSet& CIDSet) {
00251   GradU.Gen(CIDSet.Len());
00252 
00253   TFltV HOSumFV; //adjust for Fv of v hold out
00254   if (HOVIDSV[UID].Len() > 0) {
00255     HOSumFV.Gen(SumFV.Len());
00256     
00257     for (int e = 0; e < HOVIDSV[UID].Len(); e++) {
00258       for (int c = 0; c < SumFV.Len(); c++) {
00259         HOSumFV[c] += GetCom(HOVIDSV[UID][e], c);
00260       }
00261     }
00262   }
00263     
00264   TUNGraph::TNodeI NI = G->GetNI(UID);
00265   int Deg = NI.GetDeg();
00266   TFltV PredV(Deg), GradV(CIDSet.Len());
00267   TIntV CIDV(CIDSet.Len());
00268   if (DoParallel && Deg + CIDSet.Len() > 10) {
00269 #pragma omp parallel for schedule(static, 1)
00270     for (int e = 0; e < Deg; e++) {
00271       if (NI.GetNbrNId(e) == UID) { continue; }
00272       if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
00273       PredV[e] = Prediction(UID, NI.GetNbrNId(e));
00274     }
00275   
00276 #pragma omp parallel for schedule(static, 1)
00277     for (int c = 0; c < CIDSet.Len(); c++) {
00278       int CID = CIDSet.GetKey(c);
00279       double Val = 0.0;
00280       for (int e = 0; e < Deg; e++) {
00281         int VID = NI.GetNbrNId(e);
00282         if (VID == UID) { continue; }
00283         if (HOVIDSV[UID].IsKey(VID)) { continue; }
00284         Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
00285       }
00286       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00287       Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
00288       CIDV[c] = CID;
00289       GradV[c] = Val;
00290     }
00291   } 
00292   else {
00293     for (int e = 0; e < Deg; e++) {
00294       if (NI.GetNbrNId(e) == UID) { continue; }
00295       if (HOVIDSV[UID].IsKey(NI.GetNbrNId(e))) { continue; }
00296       PredV[e] = Prediction(UID, NI.GetNbrNId(e));
00297     }
00298   
00299     for (int c = 0; c < CIDSet.Len(); c++) {
00300       int CID = CIDSet.GetKey(c);
00301       double Val = 0.0;
00302       for (int e = 0; e < Deg; e++) {
00303         int VID = NI.GetNbrNId(e);
00304         if (VID == UID) { continue; }
00305         if (HOVIDSV[UID].IsKey(VID)) { continue; }
00306         Val += PredV[e] * GetCom(VID, CID) / (1.0 - PredV[e]) + NegWgt * GetCom(VID, CID);
00307       }
00308       double HOSum = HOVIDSV[UID].Len() > 0?  HOSumFV[CID].Val: 0.0;//subtract Hold out pairs only if hold out pairs exist
00309       Val -= NegWgt * (SumFV[CID] - HOSum - GetCom(UID, CID));
00310       CIDV[c] = CID;
00311       GradV[c] = Val;
00312     }
00313   }
00314   //add regularization
00315   if (RegCoef > 0.0) { //L1
00316     for (int c = 0; c < GradV.Len(); c++) {
00317       GradV[c] -= RegCoef; 
00318     }
00319   }
00320   if (RegCoef < 0.0) { //L2
00321     for (int c = 0; c < GradV.Len(); c++) {
00322       GradV[c] += 2 * RegCoef * GetCom(UID, CIDV[c]); 
00323     }
00324   }
00325 
00326 
00327   for (int c = 0; c < GradV.Len(); c++) {
00328     if (GetCom(UID, CIDV[c]) == 0.0 && GradV[c] < 0.0) { continue; }
00329     if (fabs(GradV[c]) < 0.0001) { continue; }
00330     GradU.AddDat(CIDV[c], GradV[c]);
00331   }
00332   for (int c = 0; c < GradU.Len(); c++) {
00333     if (GradU[c] >= 10) { GradU[c] = 10; }
00334     if (GradU[c] <= -10) { GradU[c] = -10; }
00335     IAssert(GradU[c] >= -10);
00336   }
00337 }
00338 
00339 double TAGMFast::GradientForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00340   TUNGraph::TNodeI UI = G->GetNI(UID);
00341   double Grad = 0.0, PNoEdge;
00342   int VID = 0;
00343   for (int e = 0; e < UI.GetDeg(); e++) {
00344     VID = UI.GetNbrNId(e);
00345     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00346     if (! F[VID].IsKey(CID)) { continue; }
00347     PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00348     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00349     //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge;
00350     Grad += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID));
00351   }
00352   Grad -= NegWgt * (SumFV[CID] - GetCom(UID, CID));
00353   //add regularization
00354   if (RegCoef > 0.0) { //L1
00355     Grad -= RegCoef; 
00356   }
00357   if (RegCoef < 0.0) { //L2
00358     Grad += 2 * RegCoef * Val; 
00359   }
00360 
00361   return Grad;
00362 }
00363 
00364 double TAGMFast::LikelihoodForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00365   TUNGraph::TNodeI UI = G->GetNI(UID);
00366   double L = 0.0, PNoEdge;
00367   int VID = 0;
00368   for (int e = 0; e < UI.GetDeg(); e++) {
00369     VID = UI.GetNbrNId(e);
00370     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00371     if (! F[VID].IsKey(CID)) { 
00372       PNoEdge = AlphaKV[e];
00373     } else {
00374       PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00375     }
00376     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00377     //PNoEdge = PNoEdge >= 1.0 - PNoCom? 1 - PNoCom: PNoEdge;
00378     L += log(1.0 - PNoEdge) + NegWgt * GetCom(VID, CID) * Val;
00379 
00380     //  += ((PNoEdge * F[VID].GetDat(CID)) / (1.0 - PNoEdge) + NegWgt * F[VID].GetDat(CID));
00381   }
00382   L -= NegWgt * (SumFV[CID] - GetCom(UID, CID)) * Val;
00383   //add regularization
00384   if (RegCoef > 0.0) { //L1
00385     L -= RegCoef * Val;
00386   }
00387   if (RegCoef < 0.0) { //L2
00388     L += RegCoef * Val * Val; 
00389   }
00390 
00391   return L;
00392 }
00393 
00394 double TAGMFast::HessianForOneVar(const TFltV& AlphaKV, const int UID, const int CID, const double& Val) {
00395   TUNGraph::TNodeI UI = G->GetNI(UID);
00396   double H = 0.0, PNoEdge;
00397   int VID = 0;
00398   for (int e = 0; e < UI.GetDeg(); e++) {
00399     VID = UI.GetNbrNId(e);
00400     if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00401     if (! F[VID].IsKey(CID)) { continue; }
00402     PNoEdge = AlphaKV[e] * exp (- F[VID].GetDat(CID) * Val);
00403     IAssert(PNoEdge <= 1.0 && PNoEdge >= 0.0);
00404     //PNoEdge = PNoEdge == 1.0? 1 - PNoCom: PNoEdge;
00405     H += (- PNoEdge * F[VID].GetDat(CID) * F[VID].GetDat(CID)) / (1.0 - PNoEdge) / (1.0 - PNoEdge);
00406   }
00407   //add regularization
00408   if (RegCoef < 0.0) { //L2
00409     H += 2 * RegCoef; 
00410   }
00411   IAssert (H <= 0.0);
00412   return H;
00413 }
00414 
00416 int TAGMFast::MLENewton(const double& Thres, const int& MaxIter, const TStr PlotNm) {
00417   TExeTm ExeTm;
00418   int iter = 0, PrevIter = 0;
00419   TIntFltPrV IterLV;
00420   double PrevL = TFlt::Mn, CurL;
00421   TUNGraph::TNodeI UI;
00422   TIntV NIdxV;
00423   G->GetNIdV(NIdxV);
00424   int CID, UID, NewtonIter;
00425   double Fuc, PrevFuc, Grad, H;
00426   while(iter < MaxIter) {
00427     NIdxV.Shuffle(Rnd);
00428     for (int ui = 0; ui < F.Len(); ui++, iter++) {
00429       if (! PlotNm.Empty() && iter % G->GetNodes() == 0) {
00430         IterLV.Add(TIntFltPr(iter, Likelihood(false)));
00431       }
00432       UID = NIdxV[ui];
00433       //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00434       TIntSet CIDSet;
00435       UI = G->GetNI(UID);
00436       if (UI.GetDeg() == 0) { //if the node is isolated, clear its membership and skip
00437         if (! F[UID].Empty()) { F[UID].Clr(); }
00438         continue;
00439       }
00440       for (int e = 0; e < UI.GetDeg(); e++) {
00441         if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00442         TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00443         for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00444           CIDSet.AddKey(CI.GetKey());
00445         }
00446       }
00447       for (TIntFltH::TIter CI = F[UID].BegI(); CI < F[UID].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00448         if (! CIDSet.IsKey(CI.GetKey())) {
00449           DelCom(UID, CI.GetKey());
00450         }
00451       }
00452       if (CIDSet.Empty()) { continue; }
00453       for (TIntSet::TIter CI = CIDSet.BegI(); CI < CIDSet.EndI(); CI++) {
00454         CID = CI.GetKey();
00455         //optimize for UID, CID
00456         //compute constants
00457         TFltV AlphaKV(UI.GetDeg());
00458         for (int e = 0; e < UI.GetDeg(); e++) {
00459           if (HOVIDSV[UID].IsKey(UI.GetNbrNId(e))) { continue; }
00460           AlphaKV[e] = (1 - PNoCom) * exp(- DotProduct(UID, UI.GetNbrNId(e)) + GetCom(UI.GetNbrNId(e), CID) * GetCom(UID, CID));
00461           IAssertR(AlphaKV[e] <= 1.0, TStr::Fmt("AlphaKV=%f, %f, %f", AlphaKV[e].Val, PNoCom.Val, GetCom(UI.GetNbrNId(e), CID)));
00462         }
00463         Fuc = GetCom(UID, CID);
00464         PrevFuc = Fuc;
00465         Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
00466         if (Grad <= 1e-3 && Grad >= -0.1) { continue; }
00467         NewtonIter = 0;
00468         while (NewtonIter++ < 10) {
00469           Grad = GradientForOneVar(AlphaKV, UID, CID, Fuc), H = 0.0;
00470           H = HessianForOneVar(AlphaKV, UID, CID, Fuc);
00471           if (Fuc == 0.0 && Grad <= 0.0) { Grad = 0.0; }
00472           if (fabs(Grad) < 1e-3) { break; }
00473           if (H == 0.0) { Fuc = 0.0; break; }
00474           double NewtonStep = - Grad / H;
00475           if (NewtonStep < -0.5) { NewtonStep = - 0.5; }
00476           Fuc += NewtonStep;
00477           if (Fuc < 0.0) { Fuc = 0.0; }
00478         }
00479         if (Fuc == 0.0) {
00480           DelCom(UID, CID);
00481         }
00482         else {
00483           AddCom(UID, CID, Fuc);
00484         }
00485       }
00486     }
00487     if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
00488       PrevIter = iter;
00489       CurL = Likelihood();
00490       if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
00491         printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
00492       }
00493       fflush(stdout);
00494       if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
00495       else { PrevL = CurL; }
00496     }
00497     
00498   }
00499   if (! PlotNm.Empty()) {
00500     printf("\nMLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
00501     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00502   }
00503   return iter;
00504 }
00505 
00506 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV) {
00507   GetCmtyVV(CmtyVV, sqrt(2.0 * (double) G->GetEdges() / G->GetNodes() / G->GetNodes()), 3);
00508 }
00509 
00511 void TAGMFast::GetCmtyVV(TVec<TIntV>& CmtyVV, const double Thres, const int MinSz) {
00512   CmtyVV.Gen(NumComs, 0);
00513   TIntFltH CIDSumFH(NumComs);
00514   for (int c = 0; c < SumFV.Len(); c++) {
00515     CIDSumFH.AddDat(c, SumFV[c]);
00516   }
00517   CIDSumFH.SortByDat(false);
00518   for (int c = 0; c < NumComs; c++) {
00519     int CID = CIDSumFH.GetKey(c);
00520     TIntFltH NIDFucH(F.Len() / 10);
00521     TIntV CmtyV;
00522     IAssert(SumFV[CID] == CIDSumFH.GetDat(CID));
00523     if (SumFV[CID] < Thres) { continue; }
00524     for (int u = 0; u < F.Len(); u++) {
00525       int NID = u;
00526       if (! NodesOk) { NID = NIDV[u]; }
00527       if (GetCom(u, CID) >= Thres) { NIDFucH.AddDat(NID, GetCom(u, CID)); }
00528     }
00529     NIDFucH.SortByDat(false);
00530     NIDFucH.GetKeyV(CmtyV);
00531     if (CmtyV.Len() >= MinSz) { CmtyVV.Add(CmtyV); }
00532   }
00533   if ( NumComs != CmtyVV.Len()) {
00534     printf("Community vector generated. %d communities are ommitted\n", NumComs.Val - CmtyVV.Len());
00535   }
00536 }
00537 
00539 int TAGMFast::FindComsByCV(const int NumThreads, const int MaxComs, const int MinComs, const int DivComs, const TStr OutFNm, const double StepAlpha, const double StepBeta) {
00540     double ComsGap = exp(TMath::Log((double) MaxComs / (double) MinComs) / (double) DivComs);
00541     TIntV ComsV;
00542     ComsV.Add(MinComs);
00543     while (ComsV.Len() < DivComs) {
00544       int NewComs = int(ComsV.Last() * ComsGap);
00545       if (NewComs == ComsV.Last().Val) { NewComs++; }
00546       ComsV.Add(NewComs);
00547     }
00548     if (ComsV.Last() < MaxComs) { ComsV.Add(MaxComs); }
00549     return FindComsByCV(ComsV, 0.1, NumThreads, OutFNm + ".CV.likelihood", StepAlpha, StepBeta);
00550 }
00551 
00552 int TAGMFast::FindComsByCV(TIntV& ComsV, const double HOFrac, const int NumThreads, const TStr PlotLFNm, const double StepAlpha, const double StepBeta) {
00553   if (ComsV.Len() == 0) {
00554     int MaxComs = G->GetNodes() / 5;
00555     ComsV.Add(2);
00556     while(ComsV.Last() < MaxComs) { ComsV.Add(ComsV.Last() * 2); }
00557   }
00558   TIntPrV EdgeV(G->GetEdges(), 0);
00559   for (TUNGraph::TEdgeI EI = G->BegEI(); EI < G->EndEI(); EI++) {
00560     EdgeV.Add(TIntPr(EI.GetSrcNId(), EI.GetDstNId()));
00561   }
00562   EdgeV.Shuffle(Rnd);
00563   int MaxIterCV = 3;
00564 
00565   TVec<TVec<TIntSet> > HoldOutSets(MaxIterCV);
00566   if (EdgeV.Len() > 50) { //if edges are many enough, use CV
00567     printf("generating hold out set\n");
00568     TIntV NIdV1, NIdV2;
00569     G->GetNIdV(NIdV1);
00570     G->GetNIdV(NIdV2);
00571     for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
00572       // generate holdout sets
00573       HoldOutSets[IterCV].Gen(G->GetNodes());
00574       const int HOTotal = int(HOFrac * G->GetNodes() * (G->GetNodes() - 1) / 2.0);
00575       int HOCnt = 0;
00576       int HOEdges = (int) TMath::Round(HOFrac * G->GetEdges());
00577       printf("holding out %d edges...\n", HOEdges);
00578       for (int he = 0; he < (int) HOEdges; he++) {
00579         HoldOutSets[IterCV][EdgeV[he].Val1].AddKey(EdgeV[he].Val2);
00580         HoldOutSets[IterCV][EdgeV[he].Val2].AddKey(EdgeV[he].Val1);
00581         HOCnt++;
00582       }
00583       printf("%d Edges hold out\n", HOCnt);
00584       while(HOCnt++ < HOTotal) {
00585         int SrcNID = Rnd.GetUniDevInt(G->GetNodes());
00586         int DstNID = Rnd.GetUniDevInt(G->GetNodes());
00587         HoldOutSets[IterCV][SrcNID].AddKey(DstNID);
00588         HoldOutSets[IterCV][DstNID].AddKey(SrcNID);
00589       }
00590     }
00591     printf("hold out set generated\n");
00592   }
00593 
00594   TFltV HOLV(ComsV.Len());
00595   TIntFltPrV ComsLV;
00596   for (int c = 0; c < ComsV.Len(); c++) {
00597     const int Coms = ComsV[c];
00598     printf("Try number of Coms:%d\n", Coms);
00599     NeighborComInit(Coms);
00600     printf("Initialized\n");
00601 
00602     if (EdgeV.Len() > 50) { //if edges are many enough, use CV
00603       for (int IterCV = 0; IterCV < MaxIterCV; IterCV++) {
00604         HOVIDSV = HoldOutSets[IterCV];
00605 
00606         if (NumThreads == 1) {
00607           printf("MLE without parallelization begins\n");
00608           MLEGradAscent(0.05, 10 * G->GetNodes(), "", StepAlpha, StepBeta);
00609         } else {
00610           printf("MLE with parallelization begins\n");
00611           MLEGradAscentParallel(0.05, 100, NumThreads, "", StepAlpha, StepBeta);
00612         }
00613         double HOL = LikelihoodHoldOut();
00614         HOL = HOL < 0? HOL: TFlt::Mn;
00615         HOLV[c] += HOL;
00616       }
00617     }
00618     else {
00619       HOVIDSV.Gen(G->GetNodes());
00620       MLEGradAscent(0.0001, 100 * G->GetNodes(), "");
00621       double BIC = 2 * Likelihood() - (double) G->GetNodes() * Coms * 2.0 * log ( (double) G->GetNodes());
00622       HOLV[c] = BIC;
00623     }
00624   }
00625   int EstComs = 2;
00626   double MaxL = TFlt::Mn;
00627   printf("\n");
00628   for (int c = 0; c < ComsV.Len(); c++) {
00629     ComsLV.Add(TIntFltPr(ComsV[c].Val, HOLV[c].Val));
00630     printf("%d(%f)\t", ComsV[c].Val, HOLV[c].Val);
00631     if (MaxL < HOLV[c]) {
00632       MaxL = HOLV[c];
00633       EstComs = ComsV[c];
00634     }
00635   }
00636   printf("\n");
00637   RandomInit(EstComs);
00638   HOVIDSV.Gen(G->GetNodes());
00639   if (! PlotLFNm.Empty()) {
00640     TGnuPlot::PlotValV(ComsLV, PlotLFNm, "hold-out likelihood", "communities", "likelihood");
00641   }
00642   return EstComs;
00643 }
00644 
00645 double TAGMFast::LikelihoodHoldOut(const bool DoParallel) { 
00646   double L = 0.0;
00647   for (int u = 0; u < HOVIDSV.Len(); u++) {
00648     for (int e = 0; e < HOVIDSV[u].Len(); e++) {
00649       int VID = HOVIDSV[u][e];
00650       if (VID == u) { continue; } 
00651       double Pred = Prediction(u, VID);
00652       if (G->IsEdge(u, VID)) {
00653         L += log(1.0 - Pred);
00654       }
00655       else {
00656         L += NegWgt * log(Pred);
00657       }
00658     }
00659   }
00660   return L;
00661 }
00662 
00663 double TAGMFast::GetStepSizeByLineSearch(const int UID, const TIntFltH& DeltaV, const TIntFltH& GradV, const double& Alpha, const double& Beta, const int MaxIter) {
00664   double StepSize = 1.0;
00665   double InitLikelihood = LikelihoodForRow(UID);
00666   TIntFltH NewVarV(DeltaV.Len());
00667   for(int iter = 0; iter < MaxIter; iter++) {
00668     for (int i = 0; i < DeltaV.Len(); i++){
00669       int CID = DeltaV.GetKey(i);
00670       double NewVal = GetCom(UID, CID) + StepSize * DeltaV.GetDat(CID);
00671       if (NewVal < MinVal) { NewVal = MinVal; }
00672       if (NewVal > MaxVal) { NewVal = MaxVal; }
00673       NewVarV.AddDat(CID, NewVal);
00674     }
00675     if (LikelihoodForRow(UID, NewVarV) < InitLikelihood + Alpha * StepSize * DotProduct(GradV, DeltaV)) {
00676       StepSize *= Beta;
00677     } else {
00678       break;
00679     }
00680     if (iter == MaxIter - 1) { 
00681       StepSize = 0.0;
00682       break;
00683     }
00684   }
00685   return StepSize;
00686 }
00687 
00688 int TAGMFast::MLEGradAscent(const double& Thres, const int& MaxIter, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
00689   time_t InitTime = time(NULL);
00690   TExeTm ExeTm, CheckTm;
00691   int iter = 0, PrevIter = 0;
00692   TIntFltPrV IterLV;
00693   TUNGraph::TNodeI UI;
00694   double PrevL = TFlt::Mn, CurL = 0.0;
00695   TIntV NIdxV(F.Len(), 0);
00696   for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
00697   IAssert(NIdxV.Len() == F.Len());
00698   TIntFltH GradV;
00699   while(iter < MaxIter) {
00700     NIdxV.Shuffle(Rnd);
00701     for (int ui = 0; ui < F.Len(); ui++, iter++) {
00702       int u = NIdxV[ui]; //
00703       //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00704       UI = G->GetNI(u);
00705       TIntSet CIDSet(5 * UI.GetDeg());
00706       for (int e = 0; e < UI.GetDeg(); e++) {
00707         if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
00708         TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00709         for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00710           CIDSet.AddKey(CI.GetKey());
00711         }
00712       }
00713       for (TIntFltH::TIter CI = F[u].BegI(); CI < F[u].EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00714         if (! CIDSet.IsKey(CI.GetKey())) {
00715           DelCom(u, CI.GetKey());
00716         }
00717       }
00718       if (CIDSet.Empty()) { continue; }
00719       GradientForRow(u, GradV, CIDSet);
00720       if (Norm2(GradV) < 1e-4) { continue; }
00721       double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta);
00722       if (LearnRate == 0.0) { continue; }
00723       for (int ci = 0; ci < GradV.Len(); ci++) {
00724         int CID = GradV.GetKey(ci);
00725         double Change = LearnRate * GradV.GetDat(CID);
00726         double NewFuc = GetCom(u, CID) + Change;
00727         if (NewFuc <= 0.0) {
00728           DelCom(u, CID);
00729         } else {
00730           AddCom(u, CID, NewFuc);
00731         }
00732       }
00733       if (! PlotNm.Empty() && (iter + 1) % G->GetNodes() == 0) {
00734         IterLV.Add(TIntFltPr(iter, Likelihood(false)));
00735       }
00736     }
00737     printf("\r%d iterations (%f) [%lu sec]", iter, CurL, time(NULL) - InitTime);
00738     fflush(stdout);
00739     if (iter - PrevIter >= 2 * G->GetNodes() && iter > 10000) {
00740       PrevIter = iter;
00741       CurL = Likelihood();
00742       if (PrevL > TFlt::Mn && ! PlotNm.Empty()) {
00743         printf("\r%d iterations, Likelihood: %f, Diff: %f", iter, CurL,  CurL - PrevL);
00744       }
00745       fflush(stdout);
00746       if (CurL - PrevL <= Thres * fabs(PrevL)) { break; }
00747       else { PrevL = CurL; }
00748     }
00749     
00750   }
00751   printf("\n");
00752   printf("MLE for Lambda completed with %d iterations(%s)\n", iter, ExeTm.GetTmStr());
00753   if (! PlotNm.Empty()) {
00754     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00755   }
00756   return iter;
00757 }
00758 
00759 int TAGMFast::MLEGradAscentParallel(const double& Thres, const int& MaxIter, const int ChunkNum, const int ChunkSize, const TStr PlotNm, const double StepAlpha, const double StepBeta) {
00760   //parallel
00761   time_t InitTime = time(NULL);
00762   uint64 StartTm = TSecTm::GetCurTm().GetAbsSecs();
00763   TExeTm ExeTm, CheckTm;
00764   double PrevL = Likelihood(true);
00765   TIntFltPrV IterLV;
00766   int PrevIter = 0;
00767   int iter = 0;
00768   TIntV NIdxV(F.Len(), 0);
00769   for (int i = 0; i < F.Len(); i++) { NIdxV.Add(i); }
00770   TIntV NIDOPTV(F.Len()); //check if a node needs optimization or not 1: does not require optimization
00771   NIDOPTV.PutAll(0);
00772   TVec<TIntFltH> NewF(ChunkNum * ChunkSize);
00773   TIntV NewNIDV(ChunkNum * ChunkSize);
00774   for (iter = 0; iter < MaxIter; iter++) {
00775     NIdxV.Clr(false);
00776     for (int i = 0; i < F.Len(); i++) { 
00777       if (NIDOPTV[i] == 0) {  NIdxV.Add(i); }
00778     }
00779     IAssert (NIdxV.Len() <= F.Len());
00780     NIdxV.Shuffle(Rnd);
00781     // compute gradient for chunk of nodes
00782 #pragma omp parallel for schedule(static, 1)
00783     for (int TIdx = 0; TIdx < ChunkNum; TIdx++) {
00784       TIntFltH GradV;
00785       for (int ui = TIdx * ChunkSize; ui < (TIdx + 1) * ChunkSize; ui++) {
00786         NewNIDV[ui] = -1;
00787         if (ui > NIdxV.Len()) { continue; }
00788         int u = NIdxV[ui]; //
00789         //find set of candidate c (we only need to consider c to which a neighbor of u belongs to)
00790         TUNGraph::TNodeI UI = G->GetNI(u);
00791         TIntSet CIDSet(5 * UI.GetDeg());
00792         TIntFltH CurFU = F[u];
00793         for (int e = 0; e < UI.GetDeg(); e++) {
00794           if (HOVIDSV[u].IsKey(UI.GetNbrNId(e))) { continue; }
00795           TIntFltH& NbhCIDH = F[UI.GetNbrNId(e)];
00796           for (TIntFltH::TIter CI = NbhCIDH.BegI(); CI < NbhCIDH.EndI(); CI++) {
00797             CIDSet.AddKey(CI.GetKey());
00798           }
00799         }
00800         if (CIDSet.Empty()) { 
00801           CurFU.Clr();
00802         }
00803         else {
00804           for (TIntFltH::TIter CI = CurFU.BegI(); CI < CurFU.EndI(); CI++) { //remove the community membership which U does not share with its neighbors
00805             if (! CIDSet.IsKey(CI.GetKey())) {
00806               CurFU.DelIfKey(CI.GetKey());
00807             }
00808           }
00809           GradientForRow(u, GradV, CIDSet);
00810           if (Norm2(GradV) < 1e-4) { NIDOPTV[u] = 1; continue; }
00811           double LearnRate = GetStepSizeByLineSearch(u, GradV, GradV, StepAlpha, StepBeta, 5);
00812           if (LearnRate <= 1e-5) { NewNIDV[ui] = -2; continue; }
00813           for (int ci = 0; ci < GradV.Len(); ci++) {
00814             int CID = GradV.GetKey(ci);
00815             double Change = LearnRate * GradV.GetDat(CID);
00816             double NewFuc = CurFU.IsKey(CID)? CurFU.GetDat(CID) + Change : Change;
00817             if (NewFuc <= 0.0) {
00818               CurFU.DelIfKey(CID);
00819             } else {
00820               CurFU.AddDat(CID) = NewFuc;
00821             }
00822           }
00823           CurFU.Defrag();
00824         }
00825         //store changes
00826         NewF[ui] = CurFU;
00827         NewNIDV[ui] = u;
00828       }
00829     }
00830     int NumNoChangeGrad = 0;
00831     int NumNoChangeStepSize = 0;
00832     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00833       int NewNID = NewNIDV[ui];
00834       if (NewNID == -1) { NumNoChangeGrad++; continue; }
00835       if (NewNID == -2) { NumNoChangeStepSize++; continue; }
00836       for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
00837         SumFV[CI.GetKey()] -= CI.GetDat();
00838       }
00839     }
00840 #pragma omp parallel for
00841     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00842       int NewNID = NewNIDV[ui];
00843       if (NewNID < 0) { continue; }
00844       F[NewNID] = NewF[ui];
00845     }
00846     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00847       int NewNID = NewNIDV[ui];
00848       if (NewNID < 0) { continue; }
00849       for (TIntFltH::TIter CI = F[NewNID].BegI(); CI < F[NewNID].EndI(); CI++) {
00850         SumFV[CI.GetKey()] += CI.GetDat();
00851       }
00852     }
00853     // update the nodes who are optimal
00854     for (int ui = 0; ui < NewNIDV.Len(); ui++) {
00855       int NewNID = NewNIDV[ui];
00856       if (NewNID < 0) { continue; }
00857       TUNGraph::TNodeI UI = G->GetNI(NewNID);
00858       NIDOPTV[NewNID] = 0;
00859       for (int e = 0; e < UI.GetDeg(); e++) {
00860         NIDOPTV[UI.GetNbrNId(e)] = 0;
00861       }
00862     }
00863     int OPTCnt = 0;
00864     for (int i = 0; i < NIDOPTV.Len(); i++) { if (NIDOPTV[i] == 1) { OPTCnt++; } }
00865     if (! PlotNm.Empty()) {
00866       printf("\r%d iterations [%s] %d secs", iter * ChunkSize * ChunkNum, ExeTm.GetTmStr(), int(TSecTm::GetCurTm().GetAbsSecs() - StartTm));
00867       if (PrevL > TFlt::Mn) { printf(" (%f) %d g %d s %d OPT", PrevL, NumNoChangeGrad, NumNoChangeStepSize, OPTCnt); }
00868       fflush(stdout);
00869     }
00870     if ((iter - PrevIter) * ChunkSize * ChunkNum >= G->GetNodes()) {
00871       PrevIter = iter;
00872       double CurL = Likelihood(true);
00873       IterLV.Add(TIntFltPr(iter * ChunkSize * ChunkNum, CurL));
00874       printf("\r%d iterations, Likelihood: %f, Diff: %f [%d secs]", iter, CurL,  CurL - PrevL, int(time(NULL) - InitTime));
00875        fflush(stdout);
00876       if (CurL - PrevL <= Thres * fabs(PrevL)) { 
00877         break;
00878       }
00879       else {
00880         PrevL = CurL;
00881       }
00882     }
00883   }
00884   if (! PlotNm.Empty()) {
00885     printf("\nMLE completed with %d iterations(%d secs)\n", iter, int(TSecTm::GetCurTm().GetAbsSecs() - StartTm));
00886     TGnuPlot::PlotValV(IterLV, PlotNm + ".likelihood_Q");
00887   } else {
00888     printf("\rMLE completed with %d iterations(%d secs)", iter, int(time(NULL) - InitTime));
00889     fflush(stdout);
00890   }
00891   return iter;
00892 }