SNAP Library, Developer Reference  2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
util.cpp
Go to the documentation of this file.
00001 
00002 // Graph Utilities
00003 void TGUtil::GetCdf(const TIntPrV& PdfV, TIntPrV& CdfV) {
00004   CdfV = PdfV;
00005   for (int i = 1; i < CdfV.Len(); i++) {
00006     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
00007 }
00008 
00009 void TGUtil::GetCdf(const TFltPrV& PdfV, TFltPrV& CdfV) {
00010   CdfV = PdfV;
00011   for (int i = 1; i < CdfV.Len(); i++) {
00012     CdfV[i].Val2 = CdfV[i-1].Val2 + CdfV[i].Val2; }
00013 }
00014 
00015 void TGUtil::GetCdf(const TIntFltKdV& PdfV, TIntFltKdV& CdfV) {
00016   CdfV = PdfV;
00017   for (int i = 1; i < CdfV.Len(); i++) {
00018     CdfV[i].Dat = CdfV[i-1].Dat + CdfV[i].Dat; }
00019 }
00020 
00021 TIntPrV TGUtil::GetCdf(const TIntPrV& PdfV) {
00022   TIntPrV CdfV;
00023   GetCdf(PdfV, CdfV);
00024   return CdfV;
00025 }
00026 
00027 TFltPrV TGUtil::GetCdf(const TFltPrV& PdfV) {
00028   TFltPrV CdfV;
00029   GetCdf(PdfV, CdfV);
00030   return CdfV;
00031 }
00032 
00033 void TGUtil::GetCCdf(const TIntPrV& PdfV, TIntPrV& CCdfV) {
00034   CCdfV = PdfV;
00035   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00036     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
00037 }
00038 
00039 void TGUtil::GetCCdf(const TFltPrV& PdfV, TFltPrV& CCdfV) {
00040   CCdfV = PdfV;
00041   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00042     CCdfV[i].Val2 = CCdfV[i+1].Val2 + CCdfV[i].Val2; }
00043 }
00044 
00045 void TGUtil::GetCCdf(const TIntFltKdV& PdfV, TIntFltKdV& CCdfV) {
00046   CCdfV = PdfV;
00047   for (int i = CCdfV.Len()-2; i >= 0; i--) {
00048     CCdfV[i].Dat = CCdfV[i+1].Dat + CCdfV[i].Dat; }
00049 }
00050 
00051 TIntPrV TGUtil::GetCCdf(const TIntPrV& PdfV) {
00052   TIntPrV CCdfV;
00053   GetCCdf(PdfV, CCdfV);
00054   return CCdfV;
00055 }
00056 
00057 TFltPrV TGUtil::GetCCdf(const TFltPrV& PdfV) {
00058   TFltPrV CCdfV;
00059   GetCCdf(PdfV, CCdfV);
00060   return CCdfV;
00061 }
00062 
00063 void TGUtil::GetPdf(const TIntPrV& CdfV, TIntPrV& PdfV) {
00064   PdfV = CdfV;
00065   for (int i = PdfV.Len()-1; i > 0; i--) {
00066     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
00067 }
00068 
00069 void TGUtil::GetPdf(const TFltPrV& CdfV, TFltPrV& PdfV) {
00070   PdfV = CdfV;
00071   for (int i = PdfV.Len()-1; i > 0; i--) {
00072     PdfV[i].Val2 = PdfV[i].Val2 - PdfV[i-1].Val2; }
00073 }
00074 
00075 void TGUtil::GetPdf(const TIntFltKdV& CdfV, TIntFltKdV& PdfV) {
00076   PdfV = CdfV;
00077   for (int i = PdfV.Len()-1; i > 0; i--) {
00078     PdfV[i].Dat = PdfV[i].Dat - PdfV[i-1].Dat; }
00079 }
00080 
00081 void TGUtil::Normalize(TFltPrV& PdfV) {
00082   double Sum = 0.0;
00083   for (int i = 0; i < PdfV.Len(); i++) {
00084     Sum += PdfV[i].Val2; }
00085   if (Sum <= 0.0) { return; }
00086   for (int i = 0; i < PdfV.Len(); i++) {
00087     PdfV[i].Val2 /= Sum; }
00088 }
00089 
00090 void TGUtil::Normalize(TIntFltKdV& PdfV) {
00091   double Sum = 0.0;
00092   for (int i = 0; i < PdfV.Len(); i++) {
00093     Sum += PdfV[i].Dat; }
00094   if (Sum <= 0.0) { return; }
00095   for (int i = 0; i < PdfV.Len(); i++) {
00096     PdfV[i].Dat /= Sum; }
00097 }
00098 
00099 void TGUtil::MakeExpBins(const TFltPrV& XYValV, TFltPrV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
00100   TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
00101 }
00102 
00103 void TGUtil::MakeExpBins(const TFltKdV& XYValV, TFltKdV& ExpXYValV, const double& BinFactor, const double& MinYVal) {
00104   TGnuPlot::MakeExpBins(XYValV, ExpXYValV, BinFactor, MinYVal);
00105 }
00106 
00107 void TGUtil::MakeExpBins(const TFltV& YValV, TFltV& ExpYValV, const double& BinFactor) {
00108   ExpYValV.Clr(true);
00109   int prevI=0;
00110   for (int i = 0; i < YValV.Len(); ) {
00111     ExpYValV.Add(YValV[i]);
00112     i = int(i*BinFactor);
00113     if (i==prevI) { i++; }
00114     prevI = i;
00115   }
00116 }
00117 
00118 void TGUtil::MakeExpBins(const TIntV& YValV, TIntV& ExpYValV, const double& BinFactor) {
00119   ExpYValV.Clr(true);
00120   int prevI=0;
00121   for (int i = 0; i < YValV.Len(); ) {
00122     ExpYValV.Add(YValV[i]);
00123     i = int(i*BinFactor);
00124     if (i==prevI) { i++; }
00125     prevI = i;
00126   }
00127 }
00128 
00130 // String helper functions and utilities
00131 // get <TagNm>TagVal</TagNm>
00132 TChA& TStrUtil::GetXmlTagVal(TXmlLx& XmlLx, const TChA& TagNm) {
00133   static TChA TagVal;
00134   EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
00135   EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
00136   const TXmlLxSym NextSym = XmlLx.GetSym();
00137   TagVal = XmlLx.TxtChA;
00138   if (NextSym == xsyStr) {
00139     EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00140   } else {
00141     EAssertR(NextSym == xsyETag, TagNm); // empty tag
00142     //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00143   }
00144   EAssertR(XmlLx.TagNm == TagNm, TagNm);
00145   return TagVal;
00146 }
00147 
00148 // get <TagNm>TagVal</TagNm>
00149 void TStrUtil::GetXmlTagNmVal(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal) {
00150   EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
00151   TagNm = XmlLx.TagNm;
00152   const TXmlLxSym NextSym = XmlLx.GetSym();
00153   TagVal = XmlLx.TxtChA;
00154   if (NextSym == xsyStr) {
00155     EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00156   } else {
00157     EAssertR(NextSym == xsyETag, TagNm); // empty tag
00158     //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00159   }
00160 }
00161 
00162 // get <TagNm>*</TagNm> (can be many tags inbetween
00163 bool TStrUtil::GetXmlTagNmVal2(TXmlLx& XmlLx, TChA& TagNm, TChA& TagVal, const bool& TakeTagNms) {
00164   if (XmlLx.GetSym() != xsySTag) {
00165     return false; }
00166   TagVal.Clr();
00167   TagNm = XmlLx.TagNm;
00168   //const TXmlLxSym NextSym = XmlLx.GetSym();
00169   while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
00170     if (TakeTagNms) {
00171       TagVal += XmlLx.TxtChA; }
00172     else if (XmlLx.Sym == xsyStr) {
00173       TagVal += XmlLx.TxtChA; }
00174     XmlLx.GetSym();
00175   }
00176   return true;
00177   //if (NextSym == xsyStr) {
00178   //  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
00179   //} else {
00180   //  EAssertR(NextSym == xsyETag, TagNm); // empty tag
00181   //  printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
00182   //}
00183 }
00184 
00185 
00186 // http://www.ijs.si/fdfd/blah.html --> www.ijs.si
00187 TChA TStrUtil::GetDomNm(const TChA& UrlChA) {
00188   int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
00189   if (EndSlash > 0) {
00190     const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
00191     if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
00192     else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
00193   } else {
00194     if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
00195     EndSlash = UrlChA.SearchCh('/', 0);
00196     if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
00197     else { return TChA(UrlChA).ToLc(); }
00198   }
00199 }
00200 // get domain name and also strip starting www.
00201 TChA TStrUtil::GetDomNm2(const TChA& UrlChA) {
00202   TChA Dom = GetDomNm(UrlChA);
00203   if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
00204   else { return Dom; }
00205 }
00206 
00207 int GetNthOccurence(const TChA& Url, const int& Count, const char Ch='/') {
00208   const char *c = Url.CStr();
00209   int cnt = 0;
00210   while (*c && cnt != Count) {
00211     if (*c == Ch) { cnt++; }
00212     c++;
00213   }
00214   return int(c-Url.CStr()-1);
00215 }
00216 
00217 // get website (GetDomNm2 or blog url)
00218 TChA TStrUtil::GetWebsiteNm(const TChA& PostUrlStr) {
00219   TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
00220   // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
00221   if (DomNm == "blog.myspace.com") {
00222     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
00223   }
00224   // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
00225   // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
00226   // http://ameblo.jp/baptism/entry-10126216277.html
00227   // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
00228   // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
00229   // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
00230   // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
00231   // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
00232   // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
00233   // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
00234   // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
00235   // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
00236   // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
00237   // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
00238   // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
00239   // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
00240   // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
00241   // http://blogs.zdnet.com/hardware/?p=2391
00242   // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
00243   // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
00244   // http://blog.tv2.dk/ole.mork/entry254689.html
00245   // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
00246   // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
00247   // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
00248   if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
00249     || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
00250     || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
00251     || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
00252     || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
00253     || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
00254       return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
00255   }
00256   // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
00257   // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
00258   if (DomNm == "digg.com") {
00259     if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
00260       const int Url = PostUrlStr.SearchStr(";url=");
00261       if (Url != -1) {
00262         return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
00263     } else {
00264       return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
00265   }
00266   // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
00267   // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
00268   // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
00269   // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
00270   // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
00271   if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
00272     || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
00273     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00274   }
00275   // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
00276   if (DomNm=="feeds.feedburner.com") {
00277     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00278   }
00279   // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
00280   if (DomNm=="groups.google.com") {
00281     return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
00282   }
00283   // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
00284   if (DomNm=="news.google.com") { // redirect
00285     const int UrlPos = PostUrlStr.SearchStr("&url=");
00286     if (UrlPos != -1) {
00287       return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
00288   }
00289   // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
00290   if (DomNm == "bloggrevyen.no") { // redirect
00291     const int Http2 = PostUrlStr.SearchStr("/http://");
00292     if (Http2!=-1) {
00293       return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
00294   }
00295   //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
00296   //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
00297   if (DomNm.IsSuffix(".rd.yahoo.com")) {
00298     const int Http2 = PostUrlStr.SearchStr("/*");
00299     if (Http2!=-1) {
00300       return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
00301   }
00302   return DomNm;
00303 }
00304 
00306 bool TStrUtil::GetNormalizedUrl(const TChA& UrlIn, const TChA& BaseUrl, TChA& UrlOut) {
00307   UrlOut = UrlIn;
00308   if (StripEnd(UrlIn, "/", UrlOut)) {}
00309   else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
00310   else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
00311   else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
00312   if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
00313     // if UrlIn is relative url, try combine it with BaseUrl
00314     if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
00315       //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
00316       return false; }
00317     TChA Out;
00318     if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
00319     if (UrlIn[0] != '/') { Out.AddCh('/'); }
00320     Out += UrlOut;
00321     UrlOut = Out;
00322   }
00323   // http://www. --> http://
00324   if (UrlOut.IsPrefix("http://www.")) {
00325     UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
00326   }
00327   UrlOut.ToLc();
00328   return true;
00329 }
00330 
00331 bool TStrUtil::StripEnd(const TChA& Str, const TChA& SearchStr, TChA& NewStr) {
00332   const int StrLen = Str.Len();
00333   const int SearchStrLen = SearchStr.Len();
00334   if (StrLen < SearchStrLen) { return false; }
00335   for (int i = 0; i < SearchStrLen; i++) {
00336     if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
00337   }
00338   NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
00339   return true;
00340 }
00341 
00342 TChA TStrUtil::GetShorStr(const TChA& LongStr, const int MaxLen) {
00343   if (LongStr.Len() < MaxLen) { return LongStr; }
00344   TChA Str = LongStr.GetSubStr(0, MaxLen-1);
00345   Str += "...";
00346   return Str;
00347 }
00348 
00349 // space separated sequence of words, remove all punctuations, etc.
00350 TChA TStrUtil::GetCleanWrdStr(const TChA& ChA) {
00351   char *b = (char *) ChA.CStr();
00352   while (*b && ! TCh::IsAlNum(*b)) { b++; }
00353   if (*b == 0) { return TChA(); }
00354   TChA OutChA(ChA.Len());
00355   char *e = b, tmp;
00356   while (*e) {
00357     b = e;
00358     while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
00359     if (b < e) {
00360       tmp = *e; *e=0;
00361       OutChA += b;  OutChA.AddCh(' ');
00362       *e = tmp;
00363     }
00364     while (*e && ! TCh::IsAlNum(*e)) { e++; }
00365     if (! *e) { break; }
00366   }
00367   OutChA.DelLastCh();  OutChA.ToLc();
00368   return OutChA;
00369 }
00370 
00371 // space seprated sequence of words (includes all non-blank characters, i.e., punctuations)
00372 TChA TStrUtil::GetCleanStr(const TChA& ChA) {
00373   char *b = (char *) ChA.CStr();
00374   while (*b && ! TCh::IsAlNum(*b)) { b++; }
00375   if (*b == 0) { return TChA(); }
00376   TChA OutChA(ChA.Len());
00377   char *e = b;
00378   bool ws=false;
00379   while (*e) {
00380     while (*e && TCh::IsWs(*e)) { e++; ws=true; }
00381     if (! *e) { break; }
00382     if (ws) { OutChA.AddCh(' '); ws=false; }
00383     OutChA.AddCh(*e);
00384     e++;
00385   }
00386   //OutChA.ToLc();
00387   return OutChA;
00388 }
00389 int TStrUtil::CountWords(const TChA& ChA) {
00390   return CountWords(ChA.CStr());
00391 }
00392 
00393 int TStrUtil::CountWords(const char* CStr) {
00394   int WrdCnt = 1;
00395   for (const char *c = CStr; *c; c++) {
00396     if (TCh::IsWs(*c)) { WrdCnt++; }
00397   }
00398   return WrdCnt;
00399 }
00400 
00401 int TStrUtil::CountWords(const TChA& ChA, const TStrHash<TInt>& StopWordH) {
00402   TChA Tmp;
00403   TVec<char *> WrdV;
00404   SplitWords(Tmp, WrdV);
00405   int SWordCnt = 0;
00406   for (int w = 0; w < WrdV.Len(); w++) {
00407     if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
00408   }
00409   return WrdV.Len() - SWordCnt;
00410 }
00411 
00412 int TStrUtil::SplitWords(TChA& ChA, TVec<char *>& WrdV, const bool& SplitOnWs) {
00413   WrdV.Clr(false);
00414   WrdV.Add(ChA.CStr());
00415   for (char *c = (char *) ChA.CStr(); *c; c++) {
00416     if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
00417       *c = 0;
00418       if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00419       WrdV.Add(c+1);
00420     }
00421   }
00422   return WrdV.Len();
00423 }
00424 
00425 int TStrUtil::SplitOnCh(TChA& ChA, TVec<char *>& WrdV, const char& Ch, const bool& SkipEmpty) {
00426   WrdV.Clr(false);
00427   WrdV.Add(ChA.CStr());
00428   for (char *c = (char *) ChA.CStr(); *c; c++) {
00429     if (*c == Ch) {
00430       *c = 0;
00431       if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00432       WrdV.Add(c+1);
00433     }
00434   }
00435   if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
00436   return WrdV.Len();
00437 }
00438 
00439 int TStrUtil::SplitLines(TChA& ChA, TVec<char *>& LineV, const bool& SkipEmpty) {
00440   LineV.Clr(false);
00441   LineV.Add(ChA.CStr());
00442   bool IsChs=false;
00443   for (char *c = (char *) ChA.CStr(); *c; c++) {
00444     if (*c == '\n') {
00445       if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
00446       *c=0;
00447       if (SkipEmpty) {
00448         if (IsChs) { LineV.Add(c+1); }
00449       } else {
00450         LineV.Add(c+1);
00451       }
00452       IsChs=false;
00453     } else {
00454       IsChs=true;
00455     }
00456   }
00457   return LineV.Len();
00458 }
00459 
00460 int TStrUtil::SplitSentences(TChA& ChA, TVec<char *>& SentenceV) {
00461   SentenceV.Clr();
00462   const char *B = ChA.CStr();
00463   const char *E = B+ChA.Len();
00464   char *c = (char *) B;
00465   while (*c && TCh::IsWs(*c)) { c++; }
00466   if (*c) { SentenceV.Add(c); } else { return 0; }
00467   for (; c < E; c++) {
00468     if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
00469       if (c<E && *(c+1)=='"') { *c='"';  c++; } // blah." --> blah"
00470       if (c>=E) { continue; }
00471       *c=0;  c++;
00472       char *e = c-1;
00473       while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
00474       while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
00475       if (c<E) { SentenceV.Add(c); }
00476     }
00477   }
00478   return SentenceV.Len();
00479 }
00480 
00481 void TStrUtil::RemoveHtmlTags(const TChA& HtmlStr, TChA& TextStr) {
00482   TextStr.Clr();
00483   char *StrB, *StrE;
00484   // use full page html: skip till <body>
00485   //PageHtmlStr = "<script fdsfs>  fsdfsd </script> jure";
00486   /*if (UseFullHtml) {
00487     StrB = PageHtmlStr.CStr();
00488     StrE = StrB+PageHtmlStr.Len();
00489     char * NewB = strstr(StrB, "<body>");
00490     if (NewB != NULL) { StrB = NewB+6; }
00491     char * NewE = strstr(StrB, "body>");
00492     if (NewE != NULL) {
00493       while (true) {
00494         char *E=strstr(NewE+4, "body>");
00495         if (E == NULL) { break; }  NewE = E; }
00496       StrE = NewE;
00497     }
00498   } else {  // only extracted post html*/
00499   StrB = (char *) HtmlStr.CStr();
00500   StrE = (char *) StrB+HtmlStr.Len(); //}
00501   for (char *e = StrB; e < StrE; ) {
00502     char* b = e;
00503     while (e<StrE && *e != '<') { e++; }
00504     // copy text
00505     char tmp=*e;  *e = 0;
00506     TextStr+= b; TextStr.AddCh(' ');  *e = tmp;
00507     if (e >= StrE) { return; }
00508     // if start of a comment: skip
00509     if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
00510       e += 3;
00511       while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
00512       e++;  continue;
00513     }
00514     // if "<script" then skip
00515     if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
00516       e += 5;
00517       while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
00518       e++;  continue;
00519     }
00520     // skip to end of tag
00521     while (e < StrE && *e != '>') { e++; }
00522     if (e>=StrE) { return; }
00523     e++;
00524   }
00525 }
00526 
00527 bool TStrUtil::IsLatinStr(const TChA& Str, const double& MinAlFrac) {
00528   int AlNumCnt=0, ChCnt=0;
00529   for (const char *c = Str.CStr(); *c; c++) {
00530     if (TCh::IsWs(*c)) { continue; }
00531     if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
00532     ChCnt++;
00533   }
00534   if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
00535   return false;
00536 }
00537 
00538 void TStrUtil::GetWIdV(const TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
00539   const int NotWId = -1;
00540   TChA ChA(CStr);
00541   TVec<char *> WrdV;
00542   TInt WId;
00543   TStrUtil::SplitWords(ChA, WrdV);
00544   WIdV.Clr(false);
00545   for (int w = 0; w < WrdV.Len(); w++) {
00546     if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
00547     else { WIdV.Add(NotWId); }
00548   }
00549 }
00550 
00551 // and words to StrH and get a vector of word ids
00552 void TStrUtil::GetAddWIdV(TStrHash<TInt>& StrH, const char *CStr, TIntV& WIdV) {
00553   TChA ChA(CStr);
00554   TVec<char *> WrdV;
00555   TInt WId;
00556   TStrUtil::SplitWords(ChA, WrdV);
00557   WIdV.Clr(false);
00558   for (int w = 0; w < WrdV.Len(); w++) {
00559     WIdV.Add(StrH.AddDatId(WrdV[w]));
00560   }
00561 }
00562 
00563 // Parse time in various formats:
00564 //   10:16, 16 Sep 2004
00565 //   10:20, 2004 Sep 16
00566 //   2005-07-07 20:30:35
00567 //   23:24:07, 2005-07-10
00568 //   9 July 2005 14:38
00569 //   21:16, July 9, 2005
00570 //   06:02, 10 July 2005
00571 bool TStrUtil::GetTmFromStr(const char* TmStr, TSecTm& Tm) {
00572   static TStrV MonthV1, MonthV2;
00573   if (MonthV1.Empty()) {
00574     TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
00575     TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
00576   }
00577   TChA Tmp(TmStr);
00578   Tmp.ToLc();
00579   TVec<char *> WrdV;
00580   const char* End = Tmp.CStr()+Tmp.Len();
00581   int Col = -1, Cols=0;
00582   for (char *b = Tmp.CStr(); b <End; ) {
00583     WrdV.Add(b);
00584     while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
00585     if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++;  }
00586     *b=0; b++;
00587     while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
00588   }
00589   if (Cols == 2) {
00590     if (Col+1 >= WrdV.Len()) { return false; }
00591     WrdV.Del(Col+1);
00592   }
00593   if (Col<1) { return false; }
00594   const int Hr = atoi(WrdV[Col-1]);
00595   const int Min = atoi(WrdV[Col]);
00596   WrdV.Del(Col);  WrdV.Del(Col-1);
00597   if (WrdV.Len() != 3) { return false; }
00598   int y=0,m=1,d=2, Mon=-1;
00599   if (TCh::IsAlpha(WrdV[0][0])) {
00600     y=2; m=0; d=1;
00601   } else if (TCh::IsAlpha(WrdV[1][0])) {
00602     y=2; m=1; d=0;
00603   } else if (TCh::IsAlpha(WrdV[2][0])) {
00604     y=0; m=2; d=1;
00605   } else {
00606     y=0; m=1; d=2;
00607     Mon = atoi(WrdV[m]);
00608   }
00609   int Day = atoi(WrdV[d]);
00610   if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
00611   if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
00612   if (Mon == 0) { return false; }
00613   int Year = atoi(WrdV[y]);
00614   if (Day > Year) { ::Swap(Day, Year); }
00615   //printf("%d-%02d-%02d  %02d:%02d\n", Year, Mon, Day, Hr, Min);
00616   Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
00617   return true;
00618 }
00619 
00620 // Standardize first and lastnames into <last_name>_<first name innitial>
00621 TStr TStrUtil::GetStdName(TStr AuthorName) {
00622   TStr StdName;
00623   AuthorName.ToLc();
00624   AuthorName.ChangeChAll('\n', ' ');
00625   AuthorName.ChangeChAll('.', ' ');
00626   // if there is a number in the name, remove it and everything after it
00627   int i, pos = 0;
00628   while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
00629     pos++; }
00630   if (pos < AuthorName.Len()) {
00631     AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
00632   if (AuthorName.Empty()) { return TStr::GetNullStr(); }
00633 
00634   // replace everything after '('
00635   int b = AuthorName.SearchCh('(');
00636   if (b != -1) {
00637     AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
00638   // skip if contains ')'
00639   if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
00640   // skip if it is not a name
00641   if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
00642    || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
00643     return TStr::GetNullStr();
00644   }
00645   // remove all non-letters (latex tags, ...)
00646   TChA NewName;
00647   for (i = 0; i < AuthorName.Len(); i++) {
00648     const char Ch = AuthorName[i];
00649     if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
00650   }
00651   StdName = NewName;  StdName.ToTrunc();
00652   TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
00653   // too short -- not a name
00654   if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
00655   if (AuthNmV.Len() < 2) return TStr::GetNullStr();
00656 
00657   const TStr LastNm = AuthNmV.Last();
00658   if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
00659 
00660   IAssert(isalpha(AuthNmV[0][0]));
00661   return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
00662 }
00663 
00664 void TStrUtil::GetStdNameV(TStr AuthorNames, TStrV& StdNameV) {
00665   AuthorNames.ChangeChAll('\n', ' ');
00666   AuthorNames.ToLc();
00667   // split into author names
00668   TStrV AuthV, TmpV, Tmp2V;
00669   // split on 'and'
00670   AuthorNames.SplitOnStr(" and ", TmpV);
00671   int i;
00672   for (i = 0; i < TmpV.Len(); i++) {
00673     TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
00674   // split on '&'
00675   TmpV = AuthV;  AuthV.Clr();
00676   for (i = 0; i < TmpV.Len(); i++) {
00677     TmpV[i].SplitOnAllCh('&', Tmp2V);  AuthV.AddV(Tmp2V); }
00678   // split on ','
00679   TmpV = AuthV;  AuthV.Clr();
00680   for (i = 0; i < TmpV.Len(); i++) {
00681     TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
00682   // split on ';'
00683   TmpV = AuthV;  AuthV.Clr();
00684   for (i = 0; i < TmpV.Len(); i++) {
00685     TmpV[i].SplitOnAllCh(';', Tmp2V);  AuthV.AddV(Tmp2V); }
00686   // standardize names
00687   StdNameV.Clr();
00688   //printf("\n*** %s\n", AuthorNames.CStr());
00689   for (i = 0; i < AuthV.Len(); i++) {
00690     TStr StdName = GetStdName(AuthV[i]);
00691     if (! StdName.Empty()) {
00692       //printf("\t%s  ==>  %s\n", AuthV[i].CStr(), StdName.CStr());
00693       StdNameV.Add(StdName);
00694     }
00695   }
00696 }