SNAP Library 2.2, Developer Reference  2014-03-11 19:15:55
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
cmty.cpp
Go to the documentation of this file.
00001 
00002 // Community detection algorithms
00003 namespace TSnap {
00004 
00005 
00006 namespace TSnapDetail {
00007 // GIRVAN-NEWMAN algorithm
00008 //      1. The betweenness of all existing edges in the network is calculated first.
00009 //      2. The edge with the highest betweenness is removed.
00010 //      3. The betweenness of all edges affected by the removal is recalculated.
00011 //      4. Steps 2 and 3 are repeated until no edges remain.
00012 //  Girvan M. and Newman M. E. J., Community structure in social and biological networks, Proc. Natl. Acad. Sci. USA 99, 7821-7826 (2002)
00013 // Keep removing edges from Graph until one of the connected components of Graph splits into two.
00014 void CmtyGirvanNewmanStep(PUNGraph& Graph, TIntV& Cmty1, TIntV& Cmty2) {
00015   TIntPrFltH BtwEH;
00016   TBreathFS<PUNGraph> BFS(Graph);
00017   Cmty1.Clr(false);  Cmty2.Clr(false);
00018   while (true) {
00019     TSnap::GetBetweennessCentr(Graph, BtwEH);
00020     BtwEH.SortByDat(false);
00021     if (BtwEH.Empty()) { return; }
00022     const int NId1 = BtwEH.GetKey(0).Val1;
00023     const int NId2 = BtwEH.GetKey(0).Val2;
00024     Graph->DelEdge(NId1, NId2);
00025     BFS.DoBfs(NId1, true, false, NId2, TInt::Mx);
00026     if (BFS.GetHops(NId1, NId2) == -1) { // two components
00027       TSnap::GetNodeWcc(Graph, NId1, Cmty1);
00028       TSnap::GetNodeWcc(Graph, NId2, Cmty2);
00029       return;
00030     }
00031   }
00032 }
00033 
00034 // Connected components of a graph define clusters
00035 // OutDegH and OrigEdges stores node degrees and number of edges in the original graph
00036 double _GirvanNewmanGetModularity(const PUNGraph& G, const TIntH& OutDegH, const int& OrigEdges, TCnComV& CnComV) {
00037   TSnap::GetWccs(G, CnComV); // get communities
00038   double Mod = 0;
00039   for (int c = 0; c < CnComV.Len(); c++) {
00040     const TIntV& NIdV = CnComV[c]();
00041     double EIn=0, EEIn=0;
00042     for (int i = 0; i < NIdV.Len(); i++) {
00043       TUNGraph::TNodeI NI = G->GetNI(NIdV[i]);
00044       EIn += NI.GetOutDeg();
00045       EEIn += OutDegH.GetDat(NIdV[i]);
00046     }
00047     Mod += (EIn-EEIn*EEIn/(2.0*OrigEdges));
00048   }
00049   if (Mod == 0) { return 0; }
00050   else { return Mod/(2.0*OrigEdges); }
00051 }
00052 
00053 TIntFltH MapEquationNew2Modules(PUNGraph& Graph, TIntH& Module, TIntFltH& Qi, int a, int b){
00054   TIntFltH Qi1;
00055   Qi1 = Qi;     
00056   float InModule=0.0, OutModule=0.0, Val;
00057   int Mds[2] = {a,b};
00058   for (int i=0; i<2; i++) {
00059     InModule=0.0, OutModule=0.0;
00060     if (Qi1.IsKey(Mds[i])){
00061       int CentralModule = Mds[i];
00062       for (TUNGraph::TEdgeI EI = Graph->BegEI(); EI < Graph->EndEI(); EI++) {
00063         if (Module.GetDat(EI.GetSrcNId()) == Module.GetDat(EI.GetDstNId()) && Module.GetDat(EI.GetDstNId()) == CentralModule) {
00064           InModule += 1.0;
00065         } else if ((Module.GetDat(EI.GetSrcNId()) == CentralModule && Module.GetDat(EI.GetDstNId()) != CentralModule) || (Module.GetDat(EI.GetSrcNId()) != CentralModule && Module.GetDat(EI.GetDstNId()) == CentralModule)) {
00066           OutModule +=1.0;
00067         }
00068             }
00069       Val = 0.0;
00070       if (InModule+OutModule > 0) {
00071         Val = OutModule/(InModule+OutModule);
00072       }
00073       Qi1.DelKey(Mds[i]);
00074       Qi1.AddDat(Mds[i],Val);
00075     } else {
00076       Qi1.DelKey(Mds[i]);
00077       Qi1.AddDat(Mds[i],0.0);
00078     }
00079   }
00080         
00081   return Qi1;
00082 }
00083 
00084 float Equation(PUNGraph& Graph, TIntFltH& PAlpha,float& SumPAlphaLogPAlpha, TIntFltH& Qi){
00085   float SumPAlpha = 1.0, SumQi = 0.0, SumQiLogQi=0.0, SumQiSumPAlphaLogQiSumPAlpha = 0.0;
00086   for (int i=0; i<Qi.Len(); i++) {
00087     SumQi += Qi[i];
00088     SumQiLogQi += Qi[i]*log(Qi[i]);
00089     SumQiSumPAlphaLogQiSumPAlpha += (Qi[i]+SumPAlpha)*log(Qi[i]+SumPAlpha);
00090   }
00091   return (SumQi*log(SumQi)-2*SumQiLogQi-SumPAlphaLogPAlpha+SumQiSumPAlphaLogQiSumPAlpha);
00092 }
00093 
00094 } // namespace TSnapDetail
00095 
00096 // Maximum modularity clustering by Girvan-Newman algorithm (slow)
00097 //  Girvan M. and Newman M. E. J., Community structure in social and biological networks, Proc. Natl. Acad. Sci. USA 99, 7821-7826 (2002)
00098 double CommunityGirvanNewman(PUNGraph& Graph, TCnComV& CmtyV) {
00099   TIntH OutDegH;
00100   const int NEdges = Graph->GetEdges();
00101   for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
00102     OutDegH.AddDat(NI.GetId(), NI.GetOutDeg());
00103   }
00104   double BestQ = -1; // modularity
00105   TCnComV CurCmtyV;
00106   CmtyV.Clr();
00107   TIntV Cmty1, Cmty2;
00108   while (true) {
00109     TSnapDetail::CmtyGirvanNewmanStep(Graph, Cmty1, Cmty2);
00110     const double Q = TSnapDetail::_GirvanNewmanGetModularity(Graph, OutDegH, NEdges, CurCmtyV);
00111     //printf("current modularity: %f\n", Q);
00112     if (Q > BestQ) {
00113       BestQ = Q; 
00114       CmtyV.Swap(CurCmtyV);
00115     }
00116     if (Cmty1.Len()==0 || Cmty2.Len() == 0) { break; }
00117   }
00118   return BestQ;
00119 }
00120 
00121 // Rosvall-Bergstrom community detection algorithm based on information theoretic approach.
00122 // See: Rosvall M., Bergstrom C. T., Maps of random walks on complex networks reveal community structure, Proc. Natl. Acad. Sci. USA 105, 1118-1123 (2008)
00123 double Infomap(PUNGraph& Graph, TCnComV& CmtyV){        
00124   TIntH DegH; 
00125   TIntFltH PAlpha; // probability of visiting node alpha
00126   TIntH Module; // module of each node
00127   TIntFltH Qi; // probability of leaving each module
00128   float SumPAlphaLogPAlpha = 0.0;
00129   int Br = 0;
00130   const int e = Graph->GetEdges(); 
00131 
00132   // initial values
00133   for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
00134     DegH.AddDat(NI.GetId(), NI.GetDeg());
00135     float d = ((float)NI.GetDeg()/(float)(2*e));
00136     PAlpha.AddDat(NI.GetId(), d);
00137     SumPAlphaLogPAlpha += d*log(d);
00138     Module.AddDat(NI.GetId(),Br);
00139     Qi.AddDat(Module[Br],1.0);
00140     Br+=1;
00141   }
00142 
00143   float MinCodeLength = TSnapDetail::Equation(Graph,PAlpha,SumPAlphaLogPAlpha,Qi);
00144   float NewCodeLength, PrevIterationCodeLength = 0.0;
00145   int OldModule, NewModule;
00146 
00147   do {
00148     PrevIterationCodeLength = MinCodeLength;
00149       for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
00150         MinCodeLength = TSnapDetail::Equation(Graph, PAlpha, SumPAlphaLogPAlpha, Qi);
00151         for (int i=0; i<DegH.GetDat(NI.GetId()); i++) {
00152           OldModule = Module.GetDat(NI.GetId());
00153           NewModule = Module.GetDat(NI.GetNbrNId(i));
00154           if (OldModule!=NewModule){
00155             Module.DelKey(NI.GetId()); 
00156             Module.AddDat(NI.GetId(),NewModule);
00157             Qi = TSnapDetail::MapEquationNew2Modules(Graph,Module,Qi,OldModule, NewModule);
00158             NewCodeLength = TSnapDetail::Equation(Graph,PAlpha,SumPAlphaLogPAlpha, Qi);
00159             if (NewCodeLength<MinCodeLength) {
00160               MinCodeLength = NewCodeLength;
00161               OldModule = NewModule;
00162             } else {
00163               Module.DelKey(NI.GetId());
00164               Module.AddDat(NI.GetId(),OldModule);
00165             }
00166           }
00167        }
00168      }
00169    } while (MinCodeLength<PrevIterationCodeLength);
00170 
00171   Module.SortByDat(true);
00172   int Mod=-1;
00173   for (int i=0; i<Module.Len(); i++) {
00174     if (Module[i]>Mod){
00175       Mod = Module[i];
00176       TCnCom t;
00177       for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++){
00178         if (Module.GetDat(NI.GetId())==Mod)
00179         t.Add(NI.GetId());
00180       }
00181       CmtyV.Add(t);
00182     }
00183   }
00184 
00185   return MinCodeLength;
00186 }
00187 
00188 namespace TSnapDetail {
00192 class TCNMQMatrix {
00193 private:
00194   struct TCmtyDat {
00195     double DegFrac;
00196     TIntFltH NIdQH;
00197     int MxQId;
00198     TCmtyDat() : MxQId(-1) { }
00199     TCmtyDat(const double& NodeDegFrac, const int& OutDeg) : 
00200       DegFrac(NodeDegFrac), NIdQH(OutDeg), MxQId(-1) { }
00201     void AddQ(const int& NId, const double& Q) { NIdQH.AddDat(NId, Q);
00202       if (MxQId==-1 || NIdQH[MxQId]<Q) { MxQId=NIdQH.GetKeyId(NId); } }
00203     void UpdateMaxQ() { MxQId=-1; 
00204       for (int i = -1; NIdQH.FNextKeyId(i); ) { 
00205         if (MxQId==-1 || NIdQH[MxQId]< NIdQH[i]) { MxQId=i; } } }
00206     void DelLink(const int& K) { const int NId=GetMxQNId(); 
00207       NIdQH.DelKey(K); if (NId==K) { UpdateMaxQ(); }  }
00208     int GetMxQNId() const { return NIdQH.GetKey(MxQId); }
00209     double GetMxQ() const { return NIdQH[MxQId]; }
00210   };
00211 private:
00212   THash<TInt, TCmtyDat> CmtyQH;
00213   THeap<TFltIntIntTr> MxQHeap;
00214   TUnionFind CmtyIdUF;
00215   double Q;
00216 public:
00217   TCNMQMatrix(const PUNGraph& Graph) : CmtyQH(Graph->GetNodes()), 
00218     MxQHeap(Graph->GetNodes()), CmtyIdUF(Graph->GetNodes()) { Init(Graph); }
00219   void Init(const PUNGraph& Graph) {
00220     const double M = 0.5/Graph->GetEdges(); // 1/2m
00221     Q = 0.0;
00222     for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
00223       CmtyIdUF.Add(NI.GetId());
00224       const int OutDeg = NI.GetOutDeg();
00225       if (OutDeg == 0) { continue; }
00226       TCmtyDat& Dat = CmtyQH.AddDat(NI.GetId(), TCmtyDat(M * OutDeg, OutDeg));
00227       for (int e = 0; e < NI.GetOutDeg(); e++) {
00228         const int DstNId = NI.GetOutNId(e);
00229         const double DstMod = 2 * M * (1.0 - OutDeg * Graph->GetNI(DstNId).GetOutDeg() * M);
00230         Dat.AddQ(DstNId, DstMod);
00231       }
00232       Q += -1.0*TMath::Sqr(OutDeg*M);
00233       if (NI.GetId() < Dat.GetMxQNId()) {
00234         MxQHeap.Add(TFltIntIntTr(Dat.GetMxQ(), NI.GetId(), Dat.GetMxQNId())); }
00235     }
00236     MxQHeap.MakeHeap();
00237   }
00238   TFltIntIntTr FindMxQEdge() {
00239     while (true) {
00240       if (MxQHeap.Empty()) { break; }
00241       const TFltIntIntTr TopQ = MxQHeap.PopHeap();
00242       if (! CmtyQH.IsKey(TopQ.Val2) || ! CmtyQH.IsKey(TopQ.Val3)) { continue; }
00243       if (TopQ.Val1!=CmtyQH.GetDat(TopQ.Val2).GetMxQ() && TopQ.Val1!=CmtyQH.GetDat(TopQ.Val3).GetMxQ()) { continue; }
00244       return TopQ;
00245     }
00246     return TFltIntIntTr(-1, -1, -1);
00247   }
00248   bool MergeBestQ() {
00249     const TFltIntIntTr TopQ = FindMxQEdge();
00250     if (TopQ.Val1 <= 0.0) { return false; }
00251     // joint communities
00252     const int I = TopQ.Val3;
00253     const int J = TopQ.Val2;
00254     CmtyIdUF.Union(I, J); // join
00255     Q += TopQ.Val1;
00256     TCmtyDat& DatJ = CmtyQH.GetDat(J);
00257     { TCmtyDat& DatI = CmtyQH.GetDat(I);
00258     DatI.DelLink(J);  DatJ.DelLink(I);
00259     for (int i = -1; DatJ.NIdQH.FNextKeyId(i); ) {
00260       const int K = DatJ.NIdQH.GetKey(i);
00261       TCmtyDat& DatK = CmtyQH.GetDat(K);
00262       double NewQ = DatJ.NIdQH[i];
00263       if (DatI.NIdQH.IsKey(K)) { NewQ = NewQ+DatI.NIdQH.GetDat(K);  DatK.DelLink(I); }     // K connected to I and J
00264       else { NewQ = NewQ-2*DatI.DegFrac*DatK.DegFrac; }  // K connected to J not I
00265       DatJ.AddQ(K, NewQ);
00266       DatK.AddQ(J, NewQ);
00267       MxQHeap.PushHeap(TFltIntIntTr(NewQ, TMath::Mn(J,K), TMath::Mx(J,K)));
00268     }
00269     for (int i = -1; DatI.NIdQH.FNextKeyId(i); ) {
00270       const int K = DatI.NIdQH.GetKey(i);
00271       if (! DatJ.NIdQH.IsKey(K)) { // K connected to I not J
00272         TCmtyDat& DatK = CmtyQH.GetDat(K);
00273         const double NewQ = DatI.NIdQH[i]-2*DatJ.DegFrac*DatK.DegFrac; 
00274         DatJ.AddQ(K, NewQ);
00275         DatK.DelLink(I);
00276         DatK.AddQ(J, NewQ);
00277         MxQHeap.PushHeap(TFltIntIntTr(NewQ, TMath::Mn(J,K), TMath::Mx(J,K)));
00278       }
00279     } 
00280     DatJ.DegFrac += DatI.DegFrac; }
00281     if (DatJ.NIdQH.Empty()) { CmtyQH.DelKey(J); } // isolated community (done)
00282     CmtyQH.DelKey(I);
00283     return true;
00284   }
00285   static double CmtyCMN(const PUNGraph& Graph, TCnComV& CmtyV) {
00286     TCNMQMatrix QMatrix(Graph);
00287     // maximize modularity
00288     while (QMatrix.MergeBestQ()) { }
00289     // reconstruct communities
00290     THash<TInt, TIntV> IdCmtyH;
00291     for (TUNGraph::TNodeI NI = Graph->BegNI(); NI < Graph->EndNI(); NI++) {
00292       IdCmtyH.AddDat(QMatrix.CmtyIdUF.Find(NI.GetId())).Add(NI.GetId()); 
00293     }
00294     CmtyV.Gen(IdCmtyH.Len());
00295     for (int j = 0; j < IdCmtyH.Len(); j++) {
00296       CmtyV[j].NIdV.Swap(IdCmtyH[j]);
00297     }
00298     return QMatrix.Q;
00299   }
00300 };
00301 
00302 } // namespace TSnapDetail
00303 
00304 double CommunityCNM(const PUNGraph& Graph, TCnComV& CmtyV) {
00305   return TSnapDetail::TCNMQMatrix::CmtyCMN(Graph, CmtyV);
00306 }
00307 
00308 }; //namespace TSnap