SNAP Library 6.0, User Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
util.cpp
Go to the documentation of this file.
1 // Graph Utilities
3 void TGUtil::GetCdf(const TIntPrV& PdfV, TIntPrV& CdfV) {
4  CdfV = PdfV;
5  for (int i = 1; i < CdfV.Len(); i++) {
6  CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
7 }
8 
9 void TGUtil::GetCdf(const TFltPrV& PdfV, TFltPrV& CdfV) {
10  CdfV = PdfV;
11  for (int i = 1; i < CdfV.Len(); i++) {
12  CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
13 }
14 
15 void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) {
16  CdfV = PdfV;
17  for (int i = 1; i < CdfV.Len(); i++) {
18  CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
19 }
20 
22  TIntPrV CdfV;
23  GetCdf(PdfV, CdfV);
24  return CdfV;
25 }
26 
28  TFltPrV CdfV;
29  GetCdf(PdfV, CdfV);
30  return CdfV;
31 }
32 
33 void TGUtil::GetCCdf(const TIntPrV& PdfV, TIntPrV& CCdfV) {
34  CCdfV = PdfV;
35  for (int i = CCdfV.Len()-2; i >= 0; i--) {
36  CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
37 }
38 
39 void TGUtil::GetCCdf(const TFltPrV& PdfV, TFltPrV& CCdfV) {
40  CCdfV = PdfV;
41  for (int i = CCdfV.Len()-2; i >= 0; i--) {
42  CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
43 }
44 
45 void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) {
46  CCdfV = PdfV;
47  for (int i = CCdfV.Len()-2; i >= 0; i--) {
48  CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
49 }
50 
52  TIntPrV CCdfV;
53  GetCCdf(PdfV, CCdfV);
54  return CCdfV;
55 }
56 
58  TFltPrV CCdfV;
59  GetCCdf(PdfV, CCdfV);
60  return CCdfV;
61 }
62 
63 void TGUtil::GetPdf(const TIntPrV& CdfV, TIntPrV& PdfV) {
64  PdfV = CdfV;
65  for (int i = PdfV.Len()-1; i > 0; i--) {
66  PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
67 }
68 
69 void TGUtil::GetPdf(const TFltPrV& CdfV, TFltPrV& PdfV) {
70  PdfV = CdfV;
71  for (int i = PdfV.Len()-1; i > 0; i--) {
72  PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
73 }
74 
75 void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
76  PdfV = CdfV;
77  for (int i = PdfV.Len()-1; i > 0; i--) {
78  PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
79 }
80 
82  double Sum = 0.0;
83  for (int i = 0; i < PdfV.Len(); i++) {
84  Sum += PdfV[i].Val2; }
85  if (Sum <= 0.0) { return; }
86  for (int i = 0; i < PdfV.Len(); i++) {
87  PdfV[i].Val2 /= Sum; }
88 }
89 
91  double Sum = 0.0;
92  for (int i = 0; i < PdfV.Len(); i++) {
93  Sum += PdfV[i].Dat; }
94  if (Sum <= 0.0) { return; }
95  for (int i = 0; i < PdfV.Len(); i++) {
96  PdfV[i].Dat /= Sum; }
97 }
98 
99 void TGUtil::MakeExpBins(const TFltPrV& XYValV, TFltPrV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
100  TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
101 }
102 
103 void TGUtil::MakeExpBins(const TFltKdV& XYValV, TFltKdV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
104  TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
105 }
106 
107 void TGUtil::MakeExpBins(const TFltV& YValV, TFltV& ExpYValV, const double& BinFactor) {
108  ExpYValV.Clr(true);
109  int prevI=0;
110  for (int i = 0; i < YValV.Len(); ) {
111  ExpYValV.Add(YValV[i]);
112  i = int(i*BinFactor);
113  if (i==prevI) { i++; }
114  prevI = i;
115  }
116 }
117 
118 void TGUtil::MakeExpBins(const TIntV& YValV, TIntV& ExpYValV, const double& BinFactor) {
119  ExpYValV.Clr(true);
120  int prevI=0;
121  for (int i = 0; i < YValV.Len(); ) {
122  ExpYValV.Add(YValV[i]);
123  i = int(i*BinFactor);
124  if (i==prevI) { i++; }
125  prevI = i;
126  }
127 }
128 
130 // String helper functions and utilities
131 // get <TagNm>TagVal</TagNm>
132 TChA& TStrUtil::GetXmlTagVal(TXmlLx& XmlLx, const TChA& TagNm) {
133  static TChA TagVal;
134  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
135  EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
136  const TXmlLxSym NextSym = XmlLx.GetSym();
137  TagVal = XmlLx.TxtChA;
138  if (NextSym == xsyStr) {
139  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
140  } else {
141  EAssertR(NextSym == xsyETag, TagNm); // empty tag
142  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
143  }
144  EAssertR(XmlLx.TagNm == TagNm, TagNm);
145  return TagVal;
146 }
147 
148 // get <TagNm>TagVal</TagNm>
149 void TStrUtil::GetXmlTagNmVal(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal) {
150  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
151  TagNm = XmlLx.TagNm;
152  const TXmlLxSym NextSym = XmlLx.GetSym();
153  TagVal = XmlLx.TxtChA;
154  if (NextSym == xsyStr) {
155  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
156  } else {
157  EAssertR(NextSym == xsyETag, TagNm); // empty tag
158  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
159  }
160 }
161 
162 // get <TagNm>*</TagNm> (there can be many tags in between)
163 bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
164  if (XmlLx.GetSym() != xsySTag) {
165  return false; }
166  TagVal.Clr();
167  TagNm = XmlLx.TagNm;
168  //const TXmlLxSym NextSym = XmlLx.GetSym();
169  while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
170  if (TakeTagNms) {
171  TagVal += XmlLx.TxtChA; }
172  else if (XmlLx.Sym == xsyStr) {
173  TagVal += XmlLx.TxtChA; }
174  XmlLx.GetSym();
175  }
176  return true;
177  //if (NextSym == xsyStr) {
178  // EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
179  //} else {
180  // EAssertR(NextSym == xsyETag, TagNm); // empty tag
181  // printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
182  //}
183 }
184 
185 
186 // http://www.ijs.si/fdfd/blah.html --> www.ijs.si
187 TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
188  int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
189  if (EndSlash > 0) {
190  const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
191  if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
192  else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
193  } else {
194  if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
195  EndSlash = UrlChA.SearchCh('/', 0);
196  if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
197  else { return TChA(UrlChA).ToLc(); }
198  }
199 }
200 // get domain name and also strip starting www.
201 TChA TStrUtil::GetDomNm2(const TChA& UrlChA) {
202  TChA Dom = GetDomNm(UrlChA);
203  if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
204  else { return Dom; }
205 }
206 
207 int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
208  const char *c = Url.CStr();
209  int cnt = 0;
210  while (*c && cnt != Count) {
211  if (*c == Ch) { cnt++; }
212  c++;
213  }
214  return int(c-Url.CStr()-1);
215 }
216 
217 // get website (GetDomNm2 or blog url)
218 TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) {
219  TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
220  // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
221  if (DomNm == "blog.myspace.com") {
222  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
223  }
224  // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
225  // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
226  // http://ameblo.jp/baptism/entry-10126216277.html
227  // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
228  // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
229  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
230  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
231  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
232  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
233  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
234  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
235  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
236  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
237  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
238  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
239  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
240  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
241  // http://blogs.zdnet.com/hardware/?p=2391
242  // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
243  // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
244  // http://blog.tv2.dk/ole.mork/entry254689.html
245  // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
246  // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
247  // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
248  if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
249  || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
250  || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
251  || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
252  || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
253  || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
254  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
255  }
256  // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
257  // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
258  if (DomNm == "digg.com") {
259  if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
260  const int Url = PostUrlStr.SearchStr(";url=");
261  if (Url != -1) {
262  return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
263  } else {
264  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
265  }
266  // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
267  // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
268  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
269  // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
270  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
271  if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
272  || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
273  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
274  }
275  // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
276  if (DomNm=="feeds.feedburner.com") {
277  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
278  }
279  // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
280  if (DomNm=="groups.google.com") {
281  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
282  }
283  // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
284  if (DomNm=="news.google.com") { // redirect
285  const int UrlPos = PostUrlStr.SearchStr("&url=");
286  if (UrlPos != -1) {
287  return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
288  }
289  // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
290  if (DomNm == "bloggrevyen.no") { // redirect
291  const int Http2 = PostUrlStr.SearchStr("/http://");
292  if (Http2!=-1) {
293  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
294  }
295  //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
296  //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
297  if (DomNm.IsSuffix(".rd.yahoo.com")) {
298  const int Http2 = PostUrlStr.SearchStr("/*");
299  if (Http2!=-1) {
300  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
301  }
302  return DomNm;
303 }
304 
306 bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
307  UrlOut = UrlIn;
308  if (StripEnd(UrlIn, "/", UrlOut)) {}
309  else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
310  else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
311  else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
312  if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
313  // if UrlIn is relative url, try combine it with BaseUrl
314  if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
315  //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
316  return false; }
317  TChA Out;
318  if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
319  if (UrlIn[0] != '/') { Out.AddCh('/'); }
320  Out += UrlOut;
321  UrlOut = Out;
322  }
323  // http://www. --> http://
324  if (UrlOut.IsPrefix("http://www.")) {
325  UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
326  }
327  UrlOut.ToLc();
328  return true;
329 }
330 
331 bool TStrUtil::StripEnd(const TChA& Str, const TChA& SearchStr, TChA& NewStr) {
332  const int StrLen = Str.Len();
333  const int SearchStrLen = SearchStr.Len();
334  if (StrLen < SearchStrLen) { return false; }
335  for (int i = 0; i < SearchStrLen; i++) {
336  if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
337  }
338  NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
339  return true;
340 }
341 
342 TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
343  if (LongStr.Len() < MaxLen) { return LongStr; }
344  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
345  Str += "...";
346  return Str;
347 }
348 
349 // space separated sequence of words, remove all punctuations, etc.
351  char *b = (char *) ChA.CStr();
352  while (*b && ! TCh::IsAlNum(*b)) { b++; }
353  if (*b == 0) { return TChA(); }
354  TChA OutChA(ChA.Len());
355  char *e = b, tmp;
356  while (*e) {
357  b = e;
358  while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
359  if (b < e) {
360  tmp = *e; *e=0;
361  OutChA += b; OutChA.AddCh(' ');
362  *e = tmp;
363  }
364  while (*e && ! TCh::IsAlNum(*e)) { e++; }
365  if (! *e) { break; }
366  }
367  OutChA.DelLastCh(); OutChA.ToLc();
368  return OutChA;
369 }
370 
371 // space separated sequence of words (includes all non-blank characters, i.e., punctuation)
373  char *b = (char *) ChA.CStr();
374  while (*b && ! TCh::IsAlNum(*b)) { b++; }
375  if (*b == 0) { return TChA(); }
376  TChA OutChA(ChA.Len());
377  char *e = b;
378  bool ws=false;
379  while (*e) {
380  while (*e && TCh::IsWs(*e)) { e++; ws=true; }
381  if (! *e) { break; }
382  if (ws) { OutChA.AddCh(' '); ws=false; }
383  OutChA.AddCh(*e);
384  e++;
385  }
386  //OutChA.ToLc();
387  return OutChA;
388 }
389 int TStrUtil::CountWords(const TChA& ChA) {
390  return CountWords(ChA.CStr());
391 }
392 
393 int TStrUtil::CountWords(const char* CStr) {
394  int WrdCnt = 1;
395  for (const char *c = CStr; *c; c++) {
396  if (TCh::IsWs(*c)) { WrdCnt++; }
397  }
398  return WrdCnt;
399 }
400 
401 int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
402  TChA Tmp;
403  TVec<char *> WrdV;
404  SplitWords(Tmp, WrdV);
405  int SWordCnt = 0;
406  for (int w = 0; w < WrdV.Len(); w++) {
407  if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
408  }
409  return WrdV.Len() - SWordCnt;
410 }
411 
412 int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
413  WrdV.Clr(false);
414  WrdV.Add(ChA.CStr());
415  for (char *c = (char *) ChA.CStr(); *c; c++) {
416  if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
417  *c = 0;
418  if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
419  WrdV.Add(c+1);
420  }
421  }
422  return WrdV.Len();
423 }
424 
425 int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
426  WrdV.Clr(false);
427  WrdV.Add(ChA.CStr());
428  for (char *c = (char *) ChA.CStr(); *c; c++) {
429  if (*c == Ch) {
430  *c = 0;
431  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
432  WrdV.Add(c+1);
433  }
434  }
435  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
436  return WrdV.Len();
437 }
438 
439 int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
440  LineV.Clr(false);
441  LineV.Add(ChA.CStr());
442  bool IsChs=false;
443  for (char *c = (char *) ChA.CStr(); *c; c++) {
444  if (*c == '\n') {
445  if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
446  *c=0;
447  if (SkipEmpty) {
448  if (IsChs) { LineV.Add(c+1); }
449  } else {
450  LineV.Add(c+1);
451  }
452  IsChs=false;
453  } else {
454  IsChs=true;
455  }
456  }
457  return LineV.Len();
458 }
459 
461  SentenceV.Clr();
462  const char *B = ChA.CStr();
463  const char *E = B+ChA.Len();
464  char *c = (char *) B;
465  while (*c && TCh::IsWs(*c)) { c++; }
466  if (*c) { SentenceV.Add(c); } else { return 0; }
467  for (; c < E; c++) {
468  if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
469  if (c<E && *(c+1)=='"') { *c='"'; c++; } // blah." --> blah"
470  if (c>=E) { continue; }
471  *c=0; c++;
472  char *e = c-1;
473  while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
474  while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
475  if (c<E) { SentenceV.Add(c); }
476  }
477  }
478  return SentenceV.Len();
479 }
480 
481 void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) {
482  TextStr.Clr();
483  char *StrB, *StrE;
484  // use full page html: skip till <body>
485  //PageHtmlStr = "<script fdsfs> fsdfsd </script> jure";
486  /*if (UseFullHtml) {
487  StrB = PageHtmlStr.CStr();
488  StrE = StrB+PageHtmlStr.Len();
489  char * NewB = strstr(StrB, "<body>");
490  if (NewB != NULL) { StrB = NewB+6; }
491  char * NewE = strstr(StrB, "body>");
492  if (NewE != NULL) {
493  while (true) {
494  char *E=strstr(NewE+4, "body>");
495  if (E == NULL) { break; } NewE = E; }
496  StrE = NewE;
497  }
498  } else { // only extracted post html*/
499  StrB = (char *) HtmlStr.CStr();
500  StrE = (char *) StrB+HtmlStr.Len(); //}
501  for (char *e = StrB; e < StrE; ) {
502  char* b = e;
503  while (e<StrE && *e != '<') { e++; }
504  // copy text
505  char tmp=*e; *e = 0;
506  TextStr+= b; TextStr.AddCh(' '); *e = tmp;
507  if (e >= StrE) { return; }
508  // if start of a comment: skip
509  if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
510  e += 3;
511  while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
512  e++; continue;
513  }
514  // if "<script" then skip
515  if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
516  e += 5;
517  while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
518  e++; continue;
519  }
520  // skip to end of tag
521  while (e < StrE && *e != '>') { e++; }
522  if (e>=StrE) { return; }
523  e++;
524  }
525 }
526 
527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
528  int AlNumCnt=0, ChCnt=0;
529  for (const char *c = Str.CStr(); *c; c++) {
530  if (TCh::IsWs(*c)) { continue; }
531  if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
532  ChCnt++;
533  }
534  if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
535  return false;
536 }
537 
538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
539  const int NotWId = -1;
540  TChA ChA(CStr);
541  TVec<char *> WrdV;
542  TInt WId;
543  TStrUtil::SplitWords(ChA, WrdV);
544  WIdV.Clr(false);
545  for (int w = 0; w < WrdV.Len(); w++) {
546  if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
547  else { WIdV.Add(NotWId); }
548  }
549 }
550 
551 // add words to StrH and get a vector of word ids
552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
553  TChA ChA(CStr);
554  TVec<char *> WrdV;
555  TInt WId;
556  TStrUtil::SplitWords(ChA, WrdV);
557  WIdV.Clr(false);
558  for (int w = 0; w < WrdV.Len(); w++) {
559  WIdV.Add(StrH.AddDatId(WrdV[w]));
560  }
561 }
562 
563 // Parse time in various formats:
564 // 10:16, 16 Sep 2004
565 // 10:20, 2004 Sep 16
566 // 2005-07-07 20:30:35
567 // 23:24:07, 2005-07-10
568 // 9 July 2005 14:38
569 // 21:16, July 9, 2005
570 // 06:02, 10 July 2005
571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) {
572  static TStrV MonthV1, MonthV2;
573  if (MonthV1.Empty()) {
574  TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
575  TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
576  }
577  TChA Tmp(TmStr);
578  Tmp.ToLc();
579  TVec<char *> WrdV;
580  const char* End = Tmp.CStr()+Tmp.Len();
581  int Col = -1, Cols=0;
582  for (char *b = Tmp.CStr(); b <End; ) {
583  WrdV.Add(b);
584  while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
585  if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++; }
586  *b=0; b++;
587  while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
588  }
589  if (Cols == 2) {
590  if (Col+1 >= WrdV.Len()) { return false; }
591  WrdV.Del(Col+1);
592  }
593  if (Col<1) { return false; }
594  const int Hr = atoi(WrdV[Col-1]);
595  const int Min = atoi(WrdV[Col]);
596  WrdV.Del(Col); WrdV.Del(Col-1);
597  if (WrdV.Len() != 3) { return false; }
598  int y=0,m=1,d=2, Mon=-1;
599  if (TCh::IsAlpha(WrdV[0][0])) {
600  y=2; m=0; d=1;
601  } else if (TCh::IsAlpha(WrdV[1][0])) {
602  y=2; m=1; d=0;
603  } else if (TCh::IsAlpha(WrdV[2][0])) {
604  y=0; m=2; d=1;
605  } else {
606  y=0; m=1; d=2;
607  Mon = atoi(WrdV[m]);
608  }
609  int Day = atoi(WrdV[d]);
610  if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
611  if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
612  if (Mon == 0) { return false; }
613  int Year = atoi(WrdV[y]);
614  if (Day > Year) { ::Swap(Day, Year); }
615  //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min);
616  Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
617  return true;
618 }
619 
620 // Standardize first and last names into <last_name>_<first_name_initial>
621 TStr TStrUtil::GetStdName(TStr AuthorName) {
622  TStr StdName;
623  AuthorName.ToLc();
624  AuthorName.ChangeChAll('\n', ' ');
625  AuthorName.ChangeChAll('.', ' ');
626  // if there is a number in the name, remove it and everything after it
627  int i, pos = 0;
628  while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
629  pos++; }
630  if (pos < AuthorName.Len()) {
631  AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
632  if (AuthorName.Empty()) { return TStr::GetNullStr(); }
633 
634  // replace everything after '('
635  int b = AuthorName.SearchCh('(');
636  if (b != -1) {
637  AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
638  // skip if contains ')'
639  if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
640  // skip if it is not a name
641  if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
642  || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
643  return TStr::GetNullStr();
644  }
645  // remove all non-letters (latex tags, ...)
646  TChA NewName;
647  for (i = 0; i < AuthorName.Len(); i++) {
648  const char Ch = AuthorName[i];
649  if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
650  }
651  StdName = NewName; StdName.ToTrunc();
652  TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
653  // too short -- not a name
654  if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
655  if (AuthNmV.Len() < 2) return TStr::GetNullStr();
656 
657  const TStr LastNm = AuthNmV.Last();
658  if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
659 
660  IAssert(isalpha(AuthNmV[0][0]));
661  return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
662 }
663 
664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) {
665  AuthorNames.ChangeChAll('\n', ' ');
666  AuthorNames.ToLc();
667  // split into author names
668  TStrV AuthV, TmpV, Tmp2V;
669  // split on 'and'
670  AuthorNames.SplitOnStr(" and ", TmpV);
671  int i;
672  for (i = 0; i < TmpV.Len(); i++) {
673  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
674  // split on '&'
675  TmpV = AuthV; AuthV.Clr();
676  for (i = 0; i < TmpV.Len(); i++) {
677  TmpV[i].SplitOnAllCh('&', Tmp2V); AuthV.AddV(Tmp2V); }
678  // split on ','
679  TmpV = AuthV; AuthV.Clr();
680  for (i = 0; i < TmpV.Len(); i++) {
681  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
682  // split on ';'
683  TmpV = AuthV; AuthV.Clr();
684  for (i = 0; i < TmpV.Len(); i++) {
685  TmpV[i].SplitOnAllCh(';', Tmp2V); AuthV.AddV(Tmp2V); }
686  // standardize names
687  StdNameV.Clr();
688  //printf("\n*** %s\n", AuthorNames.CStr());
689  for (i = 0; i < AuthV.Len(); i++) {
690  TStr StdName = GetStdName(AuthV[i]);
691  if (! StdName.Empty()) {
692  //printf("\t%s ==> %s\n", AuthV[i].CStr(), StdName.CStr());
693  StdNameV.Add(StdName);
694  }
695  }
696 }
697 
700 
701 double TStopwatch::Tick() {
702 
703  //return clock() / ((double)CLOCKS_PER_SEC);
704 
705 #ifdef USE_OPENMP
706 
707  return omp_get_wtime();
708 
709 #else
710 
711 #ifdef GLib_WIN32
712 
713  return GetTickCount() / 1000.0;
714 
715 #else
716 
717  struct rusage rusage;
718 
719  getrusage(RUSAGE_SELF, &rusage);
720 
721 
722 
723  float cputime =
724 
725  ((float) (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) / 1000000) +
726 
727  ((float) (rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec));
728  return cputime;
729 #endif
730 #endif
731 }
732 
733 void TStopwatch::Start(const TExperiment Exp) {
734  Starts[Exp] = Tick();
735 }
736 
737 void TStopwatch::Stop(const TExperiment Exp) {
738  double Duration = Tick() - Starts[Exp];
739  Sums[Exp] += Duration;
740  Maxs[Exp] = Maxs[Exp] >= Duration ? Maxs[Exp] : Duration;
741  Mins[Exp] = Mins[Exp] <= Duration ? Mins[Exp] : Duration;
742  Cnts[Exp]++;
743 }
744 
745 int TStopwatch::Cnt(const TExperiment Exp) const {
746  return Cnts[Exp];
747 }
748 
749 double TStopwatch::Sum(const TExperiment Exp) const {
750  return Sums[Exp];
751 }
752 
753 double TStopwatch::Avg(const TExperiment Exp) const {
754  return Sums[Exp] / Cnts[Exp];
755 }
756 
757 double TStopwatch::Max(const TExperiment Exp) const {
758  return Maxs[Exp];
759 }
760 
761 double TStopwatch::Min(const TExperiment Exp) const {
762  return Mins[Exp];
763 }
764 
767 
768 #if defined(SW_WRITEN)
// Writes exactly nbytes bytes from ptr to the descriptor fd, calling write()
// repeatedly until everything has been accepted.
// Returns the number of bytes written (nbytes on success, 0 when nbytes is
// not positive). If any write() call returns zero or a negative value, that
// value is returned immediately.
int WriteN(int fd, char *ptr, int nbytes) {
  int done = 0;
  while (done < nbytes) {
    const int rc = (int) write(fd, ptr + done, nbytes - done);
    if (rc <= 0) {
      return rc;
    }
    done += rc;
  }
  return done;
}
784 #endif
785 
TXmlLxSym GetSym()
Definition: xml.cpp:757
TStr TagNm
Definition: xml.h:141
TChA TxtChA
Definition: xml.h:140
static TChA GetDomNm(const TChA &UrlChA)
Definition: util.cpp:187
static bool GetNormalizedUrl(const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www.
Definition: util.cpp:306
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
Definition: gnuplot.cpp:614
static TChA GetWebsiteNm(const TChA &UrlChA)
Definition: util.cpp:218
bool Empty() const
Definition: dt.h:260
static const int Mx
Definition: dt.h:1142
static TChA GetDomNm2(const TChA &UrlChA)
Definition: util.cpp:201
void Clr()
Definition: dt.h:258
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
static int SplitSentences(TChA &ChA, TVec< char * > &SentenceV)
Definition: util.cpp:460
static void GetXmlTagNmVal(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
Definition: util.cpp:149
int Len() const
Definition: dt.h:259
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
Definition: util.cpp:99
int SearchStr(const TChA &Str, const int &BChN=0) const
Definition: dt.cpp:485
static void RemoveHtmlTags(const TChA &HtmlStr, TChA &TextStr)
Definition: util.cpp:481
static bool GetXmlTagNmVal2(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
Definition: util.cpp:163
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:570
static void GetPdf(const TIntPrV &CdfV, TIntPrV &PdfV)
Definition: util.cpp:63
static TChA GetShorStr(const TChA &LongStr, const int MaxLen=50)
Definition: util.cpp:342
int SearchChBack(const char &Ch, int BChN=-1) const
Definition: dt.cpp:477
static bool IsWs(const char &Ch)
Definition: dt.h:1063
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static int SplitLines(TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
Definition: util.cpp:439
char * CStr()
Definition: dt.h:255
bool IsKey(const char *Key) const
Definition: hash.h:897
bool IsPrefix(const char *CStr, const int &BChN=0) const
Definition: dt.cpp:499
Definition: xml.h:98
static int CountWords(const char *CStr)
Definition: util.cpp:393
Definition: xml.h:93
static int SplitOnCh(TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
Definition: util.cpp:425
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
Definition: util.cpp:412
static TChA GetCleanWrdStr(const TChA &ChA)
Definition: util.cpp:350
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
Definition: xml.h:93
static TChA & GetXmlTagVal(TXmlLx &XmlLx, const TChA &TagNm)
Definition: util.cpp:132
Definition: xml.h:93
static void GetCdf(const TIntPrV &PdfV, TIntPrV &CdfV)
Definition: util.cpp:3
static void GetCCdf(const TIntPrV &PdfV, TIntPrV &CCdfV)
Definition: util.cpp:33
TChA & ToLc()
Definition: dt.cpp:552
Definition: hash.h:781
Definition: dt.h:201
static bool IsAlNum(const char &Ch)
Definition: dt.h:1068
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:470
TXmlLxSym
Definition: xml.h:89
bool IsSuffix(const char *CStr) const
Definition: dt.cpp:518
static void Normalize(TFltPrV &PdfV)
Definition: util.cpp:81
#define EAssertR(Cond, MsgStr)
Definition: bd.h:283
int GetNthOccurence(const TChA &Url, const int &Count, const char Ch='/')
Definition: util.cpp:207
TXmlLxSym Sym
Definition: xml.h:139
char * CStr()
Definition: dt.h:479
static TChA GetCleanStr(const TChA &ChA)
Definition: util.cpp:372
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
static bool StripEnd(const TChA &Str, const TChA &SearchStr, TChA &NewStr)
Definition: util.cpp:331