SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
util.cpp
Go to the documentation of this file.
1 // Graph Utilities
3 void TGUtil::GetCdf(const TIntPrV& PdfV, TIntPrV& CdfV) {
4  CdfV = PdfV;
5  for (int i = 1; i < CdfV.Len(); i++) {
6  CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
7 }
8 
9 void TGUtil::GetCdf(const TFltPrV& PdfV, TFltPrV& CdfV) {
10  CdfV = PdfV;
11  for (int i = 1; i < CdfV.Len(); i++) {
12  CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
13 }
14 
15 void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) {
16  CdfV = PdfV;
17  for (int i = 1; i < CdfV.Len(); i++) {
18  CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
19 }
20 
22  TIntPrV CdfV;
23  GetCdf(PdfV, CdfV);
24  return CdfV;
25 }
26 
28  TFltPrV CdfV;
29  GetCdf(PdfV, CdfV);
30  return CdfV;
31 }
32 
33 void TGUtil::GetCCdf(const TIntPrV& PdfV, TIntPrV& CCdfV) {
34  CCdfV = PdfV;
35  for (int i = CCdfV.Len()-2; i >= 0; i--) {
36  CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
37 }
38 
39 void TGUtil::GetCCdf(const TFltPrV& PdfV, TFltPrV& CCdfV) {
40  CCdfV = PdfV;
41  for (int i = CCdfV.Len()-2; i >= 0; i--) {
42  CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
43 }
44 
45 void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) {
46  CCdfV = PdfV;
47  for (int i = CCdfV.Len()-2; i >= 0; i--) {
48  CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
49 }
50 
52  TIntPrV CCdfV;
53  GetCCdf(PdfV, CCdfV);
54  return CCdfV;
55 }
56 
58  TFltPrV CCdfV;
59  GetCCdf(PdfV, CCdfV);
60  return CCdfV;
61 }
62 
63 void TGUtil::GetPdf(const TIntPrV& CdfV, TIntPrV& PdfV) {
64  PdfV = CdfV;
65  for (int i = PdfV.Len()-1; i > 0; i--) {
66  PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
67 }
68 
69 void TGUtil::GetPdf(const TFltPrV& CdfV, TFltPrV& PdfV) {
70  PdfV = CdfV;
71  for (int i = PdfV.Len()-1; i > 0; i--) {
72  PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
73 }
74 
75 void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
76  PdfV = CdfV;
77  for (int i = PdfV.Len()-1; i > 0; i--) {
78  PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
79 }
80 
82  double Sum = 0.0;
83  for (int i = 0; i < PdfV.Len(); i++) {
84  Sum += PdfV[i].Val2; }
85  if (Sum <= 0.0) { return; }
86  for (int i = 0; i < PdfV.Len(); i++) {
87  PdfV[i].Val2 /= Sum; }
88 }
89 
91  double Sum = 0.0;
92  for (int i = 0; i < PdfV.Len(); i++) {
93  Sum += PdfV[i].Dat; }
94  if (Sum <= 0.0) { return; }
95  for (int i = 0; i < PdfV.Len(); i++) {
96  PdfV[i].Dat /= Sum; }
97 }
98 
99 void TGUtil::MakeExpBins(const TFltPrV& XYValV, TFltPrV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
100  TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
101 }
102 
103 void TGUtil::MakeExpBins(const TFltKdV& XYValV, TFltKdV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
104  TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
105 }
106 
107 void TGUtil::MakeExpBins(const TFltV& YValV, TFltV& ExpYValV, const double& BinFactor) {
108  ExpYValV.Clr(true);
109  int prevI=0;
110  for (int i = 0; i < YValV.Len(); ) {
111  ExpYValV.Add(YValV[i]);
112  i = int(i*BinFactor);
113  if (i==prevI) { i++; }
114  prevI = i;
115  }
116 }
117 
118 void TGUtil::MakeExpBins(const TIntV& YValV, TIntV& ExpYValV, const double& BinFactor) {
119  ExpYValV.Clr(true);
120  int prevI=0;
121  for (int i = 0; i < YValV.Len(); ) {
122  ExpYValV.Add(YValV[i]);
123  i = int(i*BinFactor);
124  if (i==prevI) { i++; }
125  prevI = i;
126  }
127 }
128 
130 // String helper functions and utilities
131 // get <TagNm>TagVal</TagNm>
132 TChA& TStrUtil::GetXmlTagVal(TXmlLx& XmlLx, const TChA& TagNm) {
133  static TChA TagVal;
134  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
135  EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
136  const TXmlLxSym NextSym = XmlLx.GetSym();
137  TagVal = XmlLx.TxtChA;
138  if (NextSym == xsyStr) {
139  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
140  } else {
141  EAssertR(NextSym == xsyETag, TagNm); // empty tag
142  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
143  }
144  EAssertR(XmlLx.TagNm == TagNm, TagNm);
145  return TagVal;
146 }
147 
148 // get <TagNm>TagVal</TagNm>
149 void TStrUtil::GetXmlTagNmVal(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal) {
150  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
151  TagNm = XmlLx.TagNm;
152  const TXmlLxSym NextSym = XmlLx.GetSym();
153  TagVal = XmlLx.TxtChA;
154  if (NextSym == xsyStr) {
155  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
156  } else {
157  EAssertR(NextSym == xsyETag, TagNm); // empty tag
158  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
159  }
160 }
161 
162 // get <TagNm>*</TagNm> (can be many tags inbetween
163 bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
164  if (XmlLx.GetSym() != xsySTag) {
165  return false; }
166  TagVal.Clr();
167  TagNm = XmlLx.TagNm;
168  //const TXmlLxSym NextSym = XmlLx.GetSym();
169  while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
170  if (TakeTagNms) {
171  TagVal += XmlLx.TxtChA; }
172  else if (XmlLx.Sym == xsyStr) {
173  TagVal += XmlLx.TxtChA; }
174  XmlLx.GetSym();
175  }
176  return true;
177  //if (NextSym == xsyStr) {
178  // EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
179  //} else {
180  // EAssertR(NextSym == xsyETag, TagNm); // empty tag
181  // printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
182  //}
183 }
184 
185 
186 // http://www.ijs.si/fdfd/blah.html --> www.ijs.si
187 TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
188  int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
189  if (EndSlash > 0) {
190  const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
191  if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
192  else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
193  } else {
194  if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
195  EndSlash = UrlChA.SearchCh('/', 0);
196  if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
197  else { return TChA(UrlChA).ToLc(); }
198  }
199 }
200 // get domain name and also strip starting www.
201 TChA TStrUtil::GetDomNm2(const TChA& UrlChA) {
202  TChA Dom = GetDomNm(UrlChA);
203  if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
204  else { return Dom; }
205 }
206 
207 int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
208  const char *c = Url.CStr();
209  int cnt = 0;
210  while (*c && cnt != Count) {
211  if (*c == Ch) { cnt++; }
212  c++;
213  }
214  return int(c-Url.CStr()-1);
215 }
216 
217 // get website (GetDomNm2 or blog url)
218 TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) {
219  TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
220  // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
221  if (DomNm == "blog.myspace.com") {
222  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
223  }
224  // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
225  // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
226  // http://ameblo.jp/baptism/entry-10126216277.html
227  // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
228  // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
229  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
230  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
231  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
232  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
233  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
234  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
235  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
236  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
237  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
238  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
239  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
240  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
241  // http://blogs.zdnet.com/hardware/?p=2391
242  // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
243  // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
244  // http://blog.tv2.dk/ole.mork/entry254689.html
245  // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
246  // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
247  // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
248  if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
249  || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
250  || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
251  || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
252  || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
253  || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
254  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
255  }
256  // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
257  // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
258  if (DomNm == "digg.com") {
259  if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
260  const int Url = PostUrlStr.SearchStr(";url=");
261  if (Url != -1) {
262  return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
263  } else {
264  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
265  }
266  // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
267  // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
268  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
269  // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
270  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
271  if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
272  || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
273  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
274  }
275  // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
276  if (DomNm=="feeds.feedburner.com") {
277  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
278  }
279  // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
280  if (DomNm=="groups.google.com") {
281  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
282  }
283  // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
284  if (DomNm=="news.google.com") { // redirect
285  const int UrlPos = PostUrlStr.SearchStr("&url=");
286  if (UrlPos != -1) {
287  return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
288  }
289  // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
290  if (DomNm == "bloggrevyen.no") { // redirect
291  const int Http2 = PostUrlStr.SearchStr("/http://");
292  if (Http2!=-1) {
293  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
294  }
295  //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
296  //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
297  if (DomNm.IsSuffix(".rd.yahoo.com")) {
298  const int Http2 = PostUrlStr.SearchStr("/*");
299  if (Http2!=-1) {
300  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
301  }
302  return DomNm;
303 }
304 
306 bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
307  UrlOut = UrlIn;
308  if (StripEnd(UrlIn, "/", UrlOut)) {}
309  else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
310  else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
311  else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
312  if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
313  // if UrlIn is relative url, try combine it with BaseUrl
314  if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
315  //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
316  return false; }
317  TChA Out;
318  if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
319  if (UrlIn[0] != '/') { Out.AddCh('/'); }
320  Out += UrlOut;
321  UrlOut = Out;
322  }
323  // http://www. --> http://
324  if (UrlOut.IsPrefix("http://www.")) {
325  UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
326  }
327  UrlOut.ToLc();
328  return true;
329 }
330 
331 bool TStrUtil::StripEnd(const TChA& Str, const TChA& SearchStr, TChA& NewStr) {
332  const int StrLen = Str.Len();
333  const int SearchStrLen = SearchStr.Len();
334  if (StrLen < SearchStrLen) { return false; }
335  for (int i = 0; i < SearchStrLen; i++) {
336  if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
337  }
338  NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
339  return true;
340 }
341 
342 TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
343  if (LongStr.Len() < MaxLen) { return LongStr; }
344  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
345  Str += "...";
346  return Str;
347 }
348 
349 // space separated sequence of words, remove all punctuations, etc.
351  char *b = (char *) ChA.CStr();
352  while (*b && ! TCh::IsAlNum(*b)) { b++; }
353  if (*b == 0) { return TChA(); }
354  TChA OutChA(ChA.Len());
355  char *e = b, tmp;
356  while (*e) {
357  b = e;
358  while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
359  if (b < e) {
360  tmp = *e; *e=0;
361  OutChA += b; OutChA.AddCh(' ');
362  *e = tmp;
363  }
364  while (*e && ! TCh::IsAlNum(*e)) { e++; }
365  if (! *e) { break; }
366  }
367  OutChA.DelLastCh(); OutChA.ToLc();
368  return OutChA;
369 }
370 
371 // space separated sequence of words (includes all non-blank characters, i.e., punctuation)
373  char *b = (char *) ChA.CStr();
374  while (*b && ! TCh::IsAlNum(*b)) { b++; }
375  if (*b == 0) { return TChA(); }
376  TChA OutChA(ChA.Len());
377  char *e = b;
378  bool ws=false;
379  while (*e) {
380  while (*e && TCh::IsWs(*e)) { e++; ws=true; }
381  if (! *e) { break; }
382  if (ws) { OutChA.AddCh(' '); ws=false; }
383  OutChA.AddCh(*e);
384  e++;
385  }
386  //OutChA.ToLc();
387  return OutChA;
388 }
389 int TStrUtil::CountWords(const TChA& ChA) {
390  return CountWords(ChA.CStr());
391 }
392 
393 int TStrUtil::CountWords(const char* CStr) {
394  int WrdCnt = 1;
395  for (const char *c = CStr; *c; c++) {
396  if (TCh::IsWs(*c)) { WrdCnt++; }
397  }
398  return WrdCnt;
399 }
400 
401 int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
402  TChA Tmp;
403  TVec<char *> WrdV;
404  SplitWords(Tmp, WrdV);
405  int SWordCnt = 0;
406  for (int w = 0; w < WrdV.Len(); w++) {
407  if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
408  }
409  return WrdV.Len() - SWordCnt;
410 }
411 
412 int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
413  WrdV.Clr(false);
414  WrdV.Add(ChA.CStr());
415  for (char *c = (char *) ChA.CStr(); *c; c++) {
416  if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
417  *c = 0;
418  if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
419  WrdV.Add(c+1);
420  }
421  }
422  return WrdV.Len();
423 }
424 
425 int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
426  WrdV.Clr(false);
427  WrdV.Add(ChA.CStr());
428  for (char *c = (char *) ChA.CStr(); *c; c++) {
429  if (*c == Ch) {
430  *c = 0;
431  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
432  WrdV.Add(c+1);
433  }
434  }
435  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
436  return WrdV.Len();
437 }
438 
439 int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
440  LineV.Clr(false);
441  LineV.Add(ChA.CStr());
442  bool IsChs=false;
443  for (char *c = (char *) ChA.CStr(); *c; c++) {
444  if (*c == '\n') {
445  if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
446  *c=0;
447  if (SkipEmpty) {
448  if (IsChs) { LineV.Add(c+1); }
449  } else {
450  LineV.Add(c+1);
451  }
452  IsChs=false;
453  } else {
454  IsChs=true;
455  }
456  }
457  return LineV.Len();
458 }
459 
461  SentenceV.Clr();
462  const char *B = ChA.CStr();
463  const char *E = B+ChA.Len();
464  char *c = (char *) B;
465  while (*c && TCh::IsWs(*c)) { c++; }
466  if (*c) { SentenceV.Add(c); } else { return 0; }
467  for (; c < E; c++) {
468  if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
469  if (c<E && *(c+1)=='"') { *c='"'; c++; } // blah." --> blah"
470  if (c>=E) { continue; }
471  *c=0; c++;
472  char *e = c-1;
473  while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
474  while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
475  if (c<E) { SentenceV.Add(c); }
476  }
477  }
478  return SentenceV.Len();
479 }
480 
481 void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) {
482  TextStr.Clr();
483  char *StrB, *StrE;
484  // use full page html: skip till <body>
485  //PageHtmlStr = "<script fdsfs> fsdfsd </script> jure";
486  /*if (UseFullHtml) {
487  StrB = PageHtmlStr.CStr();
488  StrE = StrB+PageHtmlStr.Len();
489  char * NewB = strstr(StrB, "<body>");
490  if (NewB != NULL) { StrB = NewB+6; }
491  char * NewE = strstr(StrB, "body>");
492  if (NewE != NULL) {
493  while (true) {
494  char *E=strstr(NewE+4, "body>");
495  if (E == NULL) { break; } NewE = E; }
496  StrE = NewE;
497  }
498  } else { // only extracted post html*/
499  StrB = (char *) HtmlStr.CStr();
500  StrE = (char *) StrB+HtmlStr.Len(); //}
501  for (char *e = StrB; e < StrE; ) {
502  char* b = e;
503  while (e<StrE && *e != '<') { e++; }
504  // copy text
505  char tmp=*e; *e = 0;
506  TextStr+= b; TextStr.AddCh(' '); *e = tmp;
507  if (e >= StrE) { return; }
508  // if start of a comment: skip
509  if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
510  e += 3;
511  while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
512  e++; continue;
513  }
514  // if "<script" then skip
515  if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
516  e += 5;
517  while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
518  e++; continue;
519  }
520  // skip to end of tag
521  while (e < StrE && *e != '>') { e++; }
522  if (e>=StrE) { return; }
523  e++;
524  }
525 }
526 
527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
528  int AlNumCnt=0, ChCnt=0;
529  for (const char *c = Str.CStr(); *c; c++) {
530  if (TCh::IsWs(*c)) { continue; }
531  if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
532  ChCnt++;
533  }
534  if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
535  return false;
536 }
537 
538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
539  const int NotWId = -1;
540  TChA ChA(CStr);
541  TVec<char *> WrdV;
542  TInt WId;
543  TStrUtil::SplitWords(ChA, WrdV);
544  WIdV.Clr(false);
545  for (int w = 0; w < WrdV.Len(); w++) {
546  if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
547  else { WIdV.Add(NotWId); }
548  }
549 }
550 
551 // and words to StrH and get a vector of word ids
552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
553  TChA ChA(CStr);
554  TVec<char *> WrdV;
555  TInt WId;
556  TStrUtil::SplitWords(ChA, WrdV);
557  WIdV.Clr(false);
558  for (int w = 0; w < WrdV.Len(); w++) {
559  WIdV.Add(StrH.AddDatId(WrdV[w]));
560  }
561 }
562 
563 // Parse time in various formats:
564 // 10:16, 16 Sep 2004
565 // 10:20, 2004 Sep 16
566 // 2005-07-07 20:30:35
567 // 23:24:07, 2005-07-10
568 // 9 July 2005 14:38
569 // 21:16, July 9, 2005
570 // 06:02, 10 July 2005
571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) {
572  static TStrV MonthV1, MonthV2;
573  if (MonthV1.Empty()) {
574  TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
575  TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
576  }
577  TChA Tmp(TmStr);
578  Tmp.ToLc();
579  TVec<char *> WrdV;
580  const char* End = Tmp.CStr()+Tmp.Len();
581  int Col = -1, Cols=0;
582  for (char *b = Tmp.CStr(); b <End; ) {
583  WrdV.Add(b);
584  while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
585  if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++; }
586  *b=0; b++;
587  while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
588  }
589  if (Cols == 2) {
590  if (Col+1 >= WrdV.Len()) { return false; }
591  WrdV.Del(Col+1);
592  }
593  if (Col<1) { return false; }
594  const int Hr = atoi(WrdV[Col-1]);
595  const int Min = atoi(WrdV[Col]);
596  WrdV.Del(Col); WrdV.Del(Col-1);
597  if (WrdV.Len() != 3) { return false; }
598  int y=0,m=1,d=2, Mon=-1;
599  if (TCh::IsAlpha(WrdV[0][0])) {
600  y=2; m=0; d=1;
601  } else if (TCh::IsAlpha(WrdV[1][0])) {
602  y=2; m=1; d=0;
603  } else if (TCh::IsAlpha(WrdV[2][0])) {
604  y=0; m=2; d=1;
605  } else {
606  y=0; m=1; d=2;
607  Mon = atoi(WrdV[m]);
608  }
609  int Day = atoi(WrdV[d]);
610  if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
611  if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
612  if (Mon == 0) { return false; }
613  int Year = atoi(WrdV[y]);
614  if (Day > Year) { ::Swap(Day, Year); }
615  //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min);
616  Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
617  return true;
618 }
619 
620 // Standardize first and lastnames into <last_name>_<first name innitial>
621 TStr TStrUtil::GetStdName(TStr AuthorName) {
622  TStr StdName;
623  AuthorName.ToLc();
624  AuthorName.ChangeChAll('\n', ' ');
625  AuthorName.ChangeChAll('.', ' ');
626  // if there is a number in the name, remove it and everything after it
627  int i, pos = 0;
628  while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
629  pos++; }
630  if (pos < AuthorName.Len()) {
631  AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
632  if (AuthorName.Empty()) { return TStr::GetNullStr(); }
633 
634  // replace everything after '('
635  int b = AuthorName.SearchCh('(');
636  if (b != -1) {
637  AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
638  // skip if contains ')'
639  if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
640  // skip if it is not a name
641  if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
642  || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
643  return TStr::GetNullStr();
644  }
645  // remove all non-letters (latex tags, ...)
646  TChA NewName;
647  for (i = 0; i < AuthorName.Len(); i++) {
648  const char Ch = AuthorName[i];
649  if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
650  }
651  StdName = NewName; StdName.ToTrunc();
652  TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
653  // too short -- not a name
654  if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
655  if (AuthNmV.Len() < 2) return TStr::GetNullStr();
656 
657  const TStr LastNm = AuthNmV.Last();
658  if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
659 
660  IAssert(isalpha(AuthNmV[0][0]));
661  return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
662 }
663 
664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) {
665  AuthorNames.ChangeChAll('\n', ' ');
666  AuthorNames.ToLc();
667  // split into author names
668  TStrV AuthV, TmpV, Tmp2V;
669  // split on 'and'
670  AuthorNames.SplitOnStr(" and ", TmpV);
671  int i;
672  for (i = 0; i < TmpV.Len(); i++) {
673  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
674  // split on '&'
675  TmpV = AuthV; AuthV.Clr();
676  for (i = 0; i < TmpV.Len(); i++) {
677  TmpV[i].SplitOnAllCh('&', Tmp2V); AuthV.AddV(Tmp2V); }
678  // split on ','
679  TmpV = AuthV; AuthV.Clr();
680  for (i = 0; i < TmpV.Len(); i++) {
681  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
682  // split on ';'
683  TmpV = AuthV; AuthV.Clr();
684  for (i = 0; i < TmpV.Len(); i++) {
685  TmpV[i].SplitOnAllCh(';', Tmp2V); AuthV.AddV(Tmp2V); }
686  // standardize names
687  StdNameV.Clr();
688  //printf("\n*** %s\n", AuthorNames.CStr());
689  for (i = 0; i < AuthV.Len(); i++) {
690  TStr StdName = GetStdName(AuthV[i]);
691  if (! StdName.Empty()) {
692  //printf("\t%s ==> %s\n", AuthV[i].CStr(), StdName.CStr());
693  StdNameV.Add(StdName);
694  }
695  }
696 }
697 
700 
701 double TStopwatch::Tick() {
702 
703  //return clock() / ((double)CLOCKS_PER_SEC);
704 
705 #ifdef USE_OPENMP
706 
707  return omp_get_wtime();
708 
709 #else
710 
711 #ifdef GLib_WIN32
712 
713  return GetTickCount() / 1000.0;
714 
715 #else
716 
717  struct rusage rusage;
718 
719  getrusage(RUSAGE_SELF, &rusage);
720 
721 
722 
723  float cputime =
724 
725  ((float) (rusage.ru_utime.tv_usec + rusage.ru_stime.tv_usec) / 1000000) +
726 
727  ((float) (rusage.ru_utime.tv_sec + rusage.ru_stime.tv_sec));
728  return cputime;
729 #endif
730 #endif
731 }
732 
733 void TStopwatch::Start(const TExperiment Exp) {
734  Starts[Exp] = Tick();
735 }
736 
737 void TStopwatch::Stop(const TExperiment Exp) {
738  double Duration = Tick() - Starts[Exp];
739  Sums[Exp] += Duration;
740  Maxs[Exp] = Maxs[Exp] >= Duration ? Maxs[Exp] : Duration;
741  Mins[Exp] = Mins[Exp] <= Duration ? Mins[Exp] : Duration;
742  Cnts[Exp]++;
743 }
744 
745 int TStopwatch::Cnt(const TExperiment Exp) const {
746  return Cnts[Exp];
747 }
748 
749 double TStopwatch::Sum(const TExperiment Exp) const {
750  return Sums[Exp];
751 }
752 
753 double TStopwatch::Avg(const TExperiment Exp) const {
754  return Sums[Exp] / Cnts[Exp];
755 }
756 
757 double TStopwatch::Max(const TExperiment Exp) const {
758  return Maxs[Exp];
759 }
760 
761 double TStopwatch::Min(const TExperiment Exp) const {
762  return Mins[Exp];
763 }
764 
767 
#if defined(SW_WRITEN)
// Writes nbytes bytes starting at ptr to file descriptor fd, calling write()
// repeatedly until everything has been written.
// Returns nbytes on success. If a write() call returns zero or a negative
// value, that value is returned immediately and the remaining bytes are not
// written.
int WriteN(int fd, char *ptr, int nbytes) {
  int Remaining = nbytes;
  char *Cur = ptr;
  while (Remaining > 0) {
    const int Written = (int) write(fd, Cur, Remaining);
    if (Written <= 0) {
      return Written;
    }
    Remaining -= Written;
    Cur += Written;
  }
  return (nbytes-Remaining);
}
#endif
785 
TXmlLxSym GetSym()
Definition: xml.cpp:757
TStr TagNm
Definition: xml.h:141
TChA TxtChA
Definition: xml.h:140
static TChA GetDomNm(const TChA &UrlChA)
Definition: util.cpp:187
static bool GetNormalizedUrl(const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www.
Definition: util.cpp:306
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
Definition: gnuplot.cpp:614
static TChA GetWebsiteNm(const TChA &UrlChA)
Definition: util.cpp:218
bool Empty() const
Definition: dt.h:260
static const int Mx
Definition: dt.h:1049
static TChA GetDomNm2(const TChA &UrlChA)
Definition: util.cpp:201
void Clr()
Definition: dt.h:258
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
static int SplitSentences(TChA &ChA, TVec< char * > &SentenceV)
Definition: util.cpp:460
static void GetXmlTagNmVal(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
Definition: util.cpp:149
int Len() const
Definition: dt.h:259
static void MakeExpBins(const TFltPrV &XYValV, TFltPrV &ExpXYValV, const double &BinFactor=2, const double &MinYVal=1)
Definition: util.cpp:99
int SearchStr(const TChA &Str, const int &BChN=0) const
Definition: dt.cpp:485
static void RemoveHtmlTags(const TChA &HtmlStr, TChA &TextStr)
Definition: util.cpp:481
static bool GetXmlTagNmVal2(TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
Definition: util.cpp:163
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
static void GetPdf(const TIntPrV &CdfV, TIntPrV &PdfV)
Definition: util.cpp:63
static TChA GetShorStr(const TChA &LongStr, const int MaxLen=50)
Definition: util.cpp:342
int SearchChBack(const char &Ch, int BChN=-1) const
Definition: dt.cpp:477
static bool IsWs(const char &Ch)
Definition: dt.h:970
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
static int SplitLines(TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
Definition: util.cpp:439
char * CStr()
Definition: dt.h:255
bool IsKey(const char *Key) const
Definition: hash.h:825
bool IsPrefix(const char *CStr, const int &BChN=0) const
Definition: dt.cpp:499
Definition: xml.h:98
static int CountWords(const char *CStr)
Definition: util.cpp:393
Definition: xml.h:93
static int SplitOnCh(TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
Definition: util.cpp:425
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
Definition: util.cpp:412
static TChA GetCleanWrdStr(const TChA &ChA)
Definition: util.cpp:350
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
Definition: xml.h:93
static TChA & GetXmlTagVal(TXmlLx &XmlLx, const TChA &TagNm)
Definition: util.cpp:132
Definition: xml.h:93
static void GetCdf(const TIntPrV &PdfV, TIntPrV &CdfV)
Definition: util.cpp:3
static void GetCCdf(const TIntPrV &PdfV, TIntPrV &CCdfV)
Definition: util.cpp:33
TChA & ToLc()
Definition: dt.cpp:552
Definition: hash.h:729
Definition: dt.h:201
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:470
TXmlLxSym
Definition: xml.h:89
bool IsSuffix(const char *CStr) const
Definition: dt.cpp:518
static void Normalize(TFltPrV &PdfV)
Definition: util.cpp:81
#define EAssertR(Cond, MsgStr)
Definition: bd.h:283
int GetNthOccurence(const TChA &Url, const int &Count, const char Ch='/')
Definition: util.cpp:207
TXmlLxSym Sym
Definition: xml.h:139
char * CStr()
Definition: dt.h:476
static TChA GetCleanStr(const TChA &ChA)
Definition: util.cpp:372
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void DelLast()
Removes the last element of the vector.
Definition: ds.h:635
static bool StripEnd(const TChA &Str, const TChA &SearchStr, TChA &NewStr)
Definition: util.cpp:331