SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
randwalk.h
Go to the documentation of this file.
1 //
2 // randwalk.h
3 // snap-core
4 //
5 // Created by Peter Lofgren on 8/5/15.
6 // Copyright (c) 2015 infolab. All rights reserved.
7 //
8 #ifndef snap_core_randwalk_h
9 #define snap_core_randwalk_h
10 
11 #include <stdio.h>
12 #include "priorityqueue.h"
13 #include "Snap.h"
14 
15 // use anonymous namespace to limit name to this file.
16 namespace {
17  double WallClockTime() {
18  // from http://stackoverflow.com/questions/588307/c-obtaining-milliseconds-time-on-linux-clock-doesnt-seem-to-work-properl/
19  struct timeval t;
20  gettimeofday(&t, NULL);
21  return t.tv_sec + t.tv_usec / 1.0e6;
22  }
23 
24  // Given a target nodeId, computes results in two maps Estimates and Residuals and one value MaxResidual.
25  // Each value Estimates.GetDat(v)
26  // approximates ppr(v, target) with additive error MaxResidual, and the residuals satisfy a
27  // useful loop invariant. See the paper "Local Computation of PageRank Contributions" for details.
28  // This variant is for the balanced variant of bidirectional ppr. It does reverse pushes, decreasing maxResidual incrementally,
29  // until the time spent equals the remaining time required for forward walks, as estimated using ForwardSecondsRMaxRatio
30  // (which is the ratio between the expected forward random walk time and the current MaxResidual that hasn't been pushed back).
31  template <class PGraph>
32  void ApproxContributionsBalanced(const PGraph& Graph,
33  double JumpProb,
34  int TargetNId,
35  float ForwardSecondsRMaxRatio,
36  TIntFltH& ResultEstimates,
37  TIntFltH& ResultResiduals,
38  float& ResultMaxResidual) {
39  double startTime = WallClockTime();
40  TMaxPriorityQueue<TInt> nodesByResidual;
41  nodesByResidual.Insert(TargetNId, 1.0f);
42  while (!nodesByResidual.IsEmpty() &&
43  WallClockTime() - startTime < ForwardSecondsRMaxRatio * nodesByResidual.GetMaxPriority()) {
44  float vPriority = nodesByResidual.GetMaxPriority();
45  int vId = nodesByResidual.PopMax();
46  ResultEstimates(vId) = ResultEstimates(vId) + JumpProb * vPriority;
47  //printf("set estimate(%d) to %g\n", vId, double(ResultEstimates(vId)));
48  TNGraph::TNodeI v = Graph->GetNI(vId);
49  for (int i = 0; i < v.GetInDeg(); i++) {
50  int uId = v.GetInNId(i);
51  TNGraph::TNodeI u = Graph->GetNI(uId);
52  float residualChange = (1.0 - JumpProb) * vPriority / u.GetOutDeg();
53  nodesByResidual.SetPriority(uId, nodesByResidual.GetPriority(uId) + residualChange);
54  //printf("set priority(%d) to %g\n", uId, double(nodesByResidual.GetPriority(uId)));
55  }
56  }
57 
58  // Copy residuals from nodesByResidual to ResultResiduals
59  nodesByResidual.GetPriorities(ResultResiduals);
60  ResultMaxResidual = nodesByResidual.IsEmpty() ? 0.0f : nodesByResidual.GetMaxPriority();
61  }
62 }
63 
64 namespace TSnap {
65 // Returns the endpoint of a random walk sampled from a random start node in StartNIdV. The walk has expected length 1/JumpProb, and restarts if it reaches a dead-end node (one with no out-neighbors).
66 template <class PGraph>
67 int SamplePersonalizedPageRank(const PGraph& Graph, double JumpProb, const TIntV& StartNIdV, TRnd& Rnd) {
68  int locationId = StartNIdV.GetRndVal(Rnd);
69  //printf("starting walk at %d\n", locationId);
70  while (Rnd.GetUniDev() >= JumpProb) {
71  TNGraph::TNodeI location = Graph->GetNI(locationId);
72  int d = location.GetOutDeg();
73  if (d > 0)
74  locationId = location.GetOutNId(Rnd.GetUniDevInt(d));
75  else
76  locationId = StartNIdV.GetRndVal(Rnd);
77  }
78  return locationId;
79 }
80 
81 // Returns the Personalized PageRank from given vector of start node ids to the given target. This is equal to the probability
82 // that a random walk from a source uniformly sampled from the given start node ids stops at the given target node id, where
83 // the walk stops after each stop with probability JumpProb. As long as the true PPR is greater than MinProbability (which defaults
84 // to 1/n on an n node graph), the mean relative error will be approximately the given RelativeError. If provableRelativeError is set to
85 // true, the result will provably (with high probability) have at most the given RelativeError, but this comes at the cost of significantly greater
86 // running time. Uses the algorithm presented in "Personalized PageRank Estimation and Search: A Bidirectional Approach" by Lofgren, Banerjee, and Goel.
87 template <class PGraph>
88  double GetPersonalizedPageRankBidirectional(const PGraph& Graph,
89  double JumpProb,
90  const TIntV& StartNIdV,
91  int TargetNId,
92  double MinProbability = -1.0,
93  double RelativeError = 0.1,
94  bool provableRelativeError = false,
95  bool PrintTimeForTuning = false) {
96  if (MinProbability <= 0.0) { // Check if minProbability not set.
97  MinProbability = 1.0 / Graph->GetNodes();
98  }
99  // In experiments, when relativeError = 0.1, a chernoff constant of 0.07 gave mean relative error less than 0.1 on several realistic graphs.
100  float kChernoffConstant = provableRelativeError ? 12 * exp((double) 1) * log(2 / 1.0e-9) : 0.07;
101  float kSecondsPerWalk = 4.0e-7; // The time required to generate a random walk. Can be tuned so that forward and reverse running times are equal, to improve running time
102  float WalkCountRMaxRatio = kChernoffConstant / (RelativeError * RelativeError) / MinProbability;
103  float ForwardSecondsRMaxRatio = kSecondsPerWalk * WalkCountRMaxRatio;
104 
105 
106  double startTime = WallClockTime();
107  // Results from ApproxContributionsBalanced are set by reference:
108  TIntFltH Estimates, Residuals;
109  float MaxResidual;
110  ApproxContributionsBalanced(Graph, JumpProb, TargetNId, ForwardSecondsRMaxRatio, Estimates, Residuals, MaxResidual);
111 
112  double reverseTime = WallClockTime() - startTime;
113  startTime = WallClockTime();
114 
115  double Estimate = 0.0;
116  // First incorporate the average Estimates value for starting nodes
117  for (int i = 0; i < StartNIdV.Len(); i++) {
118  Estimate += Estimates.GetDatWithDefault(StartNIdV[i], 0.0) / StartNIdV.Len();
119  }
120 
121  int RandomWalkCount = static_cast<int>(WalkCountRMaxRatio * MaxResidual);
122  TRnd Rnd(0); // 0 means seed from clock. We use an explicit Rnd for thread safety.
123  for (int i = 0; i < RandomWalkCount; i++) {
124  int vId = SamplePersonalizedPageRank(Graph, JumpProb, StartNIdV, Rnd);
125  Estimate += Residuals.GetDatWithDefault(vId, 0.0) / RandomWalkCount;
126  }
127  double forwardTime = WallClockTime() - startTime;
128  if (PrintTimeForTuning) printf("forwardTime reverseTime %g %g\n", forwardTime, reverseTime);
129 
130  return Estimate;
131 }
132 
133 // Convenience method that takes a single startNId and converts it to a vector of length 1 before calling GetPersonalizedPageRankBidirectional.
134 template <class PGraph>
135  double GetRndWalkRestartBidirectional(const PGraph& Graph,
136  double JumpProb,
137  int StartNId,
138  int TargetNId,
139  double minProbability = -1.0,
140  double relativeError = 0.1,
141  bool proveRelativeError = false,
142  bool PrintTimeForTuning = false) {
143  return GetPersonalizedPageRankBidirectional(Graph, JumpProb, TIntV::GetV(StartNId), TargetNId,
144  minProbability, relativeError, proveRelativeError, PrintTimeForTuning);
145  }
146 
147 }; // namespace TSnap
148 
149 #endif
void ApproxContributionsBalanced(const PGraph &Graph, double JumpProb, int TargetNId, float ForwardSecondsRMaxRatio, TIntFltH &ResultEstimates, TIntFltH &ResultResiduals, float &ResultMaxResidual)
Definition: randwalk.h:32
float GetPriority(const TVal &X)
Definition: priorityqueue.h:49
Definition: dt.h:11
void GetPriorities(THash< TVal, TFlt > &Result)
Definition: priorityqueue.h:83
double GetRndWalkRestartBidirectional(const PGraph &Graph, double JumpProb, int StartNId, int TargetNId, double minProbability=-1.0, double relativeError=0.1, bool proveRelativeError=false, bool PrintTimeForTuning=false)
Definition: randwalk.h:135
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TDat GetDatWithDefault(const TKey &Key, TDat DefaultValue)
Definition: hash.h:264
int SamplePersonalizedPageRank(const PGraph &Graph, double JumpProb, const TIntV &StartNIdV, TRnd &Rnd)
Definition: randwalk.h:67
const TVal & GetRndVal(TRnd &Rnd=TInt::Rnd) const
Returns a reference to a random element in the vector.
Definition: ds.h:589
void SetPriority(const TVal &X, float NewPriority)
Definition: priorityqueue.h:31
int GetOutDeg() const
Returns out-degree of the current node.
Definition: graph.h:402
void Insert(const TVal &X, float Priority)
Definition: priorityqueue.h:23
Node iterator. Only forward iteration (operator++) is supported.
Definition: graph.h:379
double GetUniDev()
Definition: dt.h:30
int GetUniDevInt(const int &Range=0)
Definition: dt.cpp:39
static TVec< TInt, TSizeTy > GetV(const TInt &Val1)
Returns a vector on element Val1.
Definition: ds.h:848
int GetInDeg() const
Returns in-degree of the current node.
Definition: graph.h:400
float GetMaxPriority()
Definition: priorityqueue.h:57
int GetInNId(const int &NodeN) const
Returns ID of NodeN-th in-node (the node pointing to the current node).
Definition: graph.h:408
double GetPersonalizedPageRankBidirectional(const PGraph &Graph, double JumpProb, const TIntV &StartNIdV, int TargetNId, double MinProbability=-1.0, double RelativeError=0.1, bool provableRelativeError=false, bool PrintTimeForTuning=false)
Definition: randwalk.h:88
int GetOutNId(const int &NodeN) const
Returns ID of NodeN-th out-node (the node the current node points to).
Definition: graph.h:412
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:430