SNAP Library 2.1, Developer Reference  2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
TStrUtil Class Reference

String helper functions and utilities. Quick and ditry! More...

#include <util.h>

List of all members.

Static Public Member Functions

static TChAGetXmlTagVal (TXmlLx &XmlLx, const TChA &TagNm)
static void GetXmlTagNmVal (TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
static bool GetXmlTagNmVal2 (TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
static TChA GetDomNm (const TChA &UrlChA)
static TChA GetDomNm2 (const TChA &UrlChA)
static TChA GetWebsiteNm (const TChA &UrlChA)
static bool GetNormalizedUrl (const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
 Quick URL nomalization: Remove ending /, /index.html, etc. and strip starting www.
static bool StripEnd (const TChA &Str, const TChA &SearchStr, TChA &NewStr)
static TChA GetShorStr (const TChA &LongStr, const int MaxLen=50)
static TChA GetCleanStr (const TChA &ChA)
static TChA GetCleanWrdStr (const TChA &ChA)
static int CountWords (const char *CStr)
static int CountWords (const TChA &ChA)
static int CountWords (const TChA &ChA, const TStrHash< TInt > &StopWordH)
static int SplitWords (TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
static int SplitOnCh (TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
static int SplitLines (TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
static int SplitSentences (TChA &ChA, TVec< char * > &SentenceV)
static void RemoveHtmlTags (const TChA &HtmlStr, TChA &TextStr)
static bool IsLatinStr (const TChA &Str, const double &MinAlFrac)
static void GetWIdV (const TStrHash< TInt > &StrH, const char *CStr, TIntV &WIdV)
static void GetAddWIdV (TStrHash< TInt > &StrH, const char *CStr, TIntV &WIdV)
static bool GetTmFromStr (const char *TmStr, TSecTm &Tm)
 Parses time in many different text formats. See source code for details.
static TStr GetStdName (TStr AuthorName)
 Puts person's name (fist middle last) in a standard form: <last_name>_<first name innitial>
static void GetStdNameV (TStr AuthorNames, TStrV &StdNameV)
 Splits a list of people's names.

Detailed Description

String helper functions and utilities. Quick and ditry!

Definition at line 34 of file util.h.


Member Function Documentation

int TStrUtil::CountWords ( const char *  CStr) [static]

Definition at line 393 of file util.cpp.

References TCh::IsWs().

Referenced by CountWords().

                                         {
  int WrdCnt = 1;
  for (const char *c = CStr; *c; c++) {
    if (TCh::IsWs(*c)) { WrdCnt++; }
  }
  return WrdCnt;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int TStrUtil::CountWords ( const TChA ChA) [static]

Definition at line 389 of file util.cpp.

References CountWords(), and TChA::CStr().

                                        {
  return CountWords(ChA.CStr());
}

Here is the call graph for this function:

int TStrUtil::CountWords ( const TChA ChA,
const TStrHash< TInt > &  StopWordH 
) [static]

Definition at line 401 of file util.cpp.

References TStrHash< TDat, TStringPool, THashFunc >::IsKey(), TVec< TVal, TSizeTy >::Len(), and SplitWords().

                                                                         {
  TChA Tmp;
  TVec<char *> WrdV;
  SplitWords(Tmp, WrdV);
  int SWordCnt = 0;
  for (int w = 0; w < WrdV.Len(); w++) {
    if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
  }
  return WrdV.Len() - SWordCnt;
}

Here is the call graph for this function:

void TStrUtil::GetAddWIdV ( TStrHash< TInt > &  StrH,
const char *  CStr,
TIntV WIdV 
) [static]

Definition at line 552 of file util.cpp.

                                                                             {
  TChA ChA(CStr);
  TVec<char *> WrdV;
  TInt WId;
  TStrUtil::SplitWords(ChA, WrdV);
  WIdV.Clr(false);
  for (int w = 0; w < WrdV.Len(); w++) {
    WIdV.Add(StrH.AddDatId(WrdV[w]));
  }
}
TChA TStrUtil::GetCleanStr ( const TChA ChA) [static]

Definition at line 372 of file util.cpp.

References TChA::AddCh(), TChA::CStr(), TCh::IsAlNum(), TCh::IsWs(), and TChA::Len().

                                          {
  char *b = (char *) ChA.CStr();
  while (*b && ! TCh::IsAlNum(*b)) { b++; }
  if (*b == 0) { return TChA(); }
  TChA OutChA(ChA.Len());
  char *e = b;
  bool ws=false;
  while (*e) {
    while (*e && TCh::IsWs(*e)) { e++; ws=true; }
    if (! *e) { break; }
    if (ws) { OutChA.AddCh(' '); ws=false; }
    OutChA.AddCh(*e);
    e++;
  }
  //OutChA.ToLc();
  return OutChA;
}

Here is the call graph for this function:

TChA TStrUtil::GetCleanWrdStr ( const TChA ChA) [static]

Definition at line 350 of file util.cpp.

References TChA::AddCh(), TChA::CStr(), TCh::IsAlNum(), and TChA::Len().

                                             {
  char *b = (char *) ChA.CStr();
  while (*b && ! TCh::IsAlNum(*b)) { b++; }
  if (*b == 0) { return TChA(); }
  TChA OutChA(ChA.Len());
  char *e = b, tmp;
  while (*e) {
    b = e;
    while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
    if (b < e) {
      tmp = *e; *e=0;
      OutChA += b;  OutChA.AddCh(' ');
      *e = tmp;
    }
    while (*e && ! TCh::IsAlNum(*e)) { e++; }
    if (! *e) { break; }
  }
  OutChA.DelLastCh();  OutChA.ToLc();
  return OutChA;
}

Here is the call graph for this function:

TChA TStrUtil::GetDomNm ( const TChA UrlChA) [static]

Definition at line 187 of file util.cpp.

References TChA::GetSubStr(), TChA::IsPrefix(), TChA::Len(), TChA::SearchCh(), TChA::SearchChBack(), and TChA::ToLc().

Referenced by GetDomNm2().

                                          {
  int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
  if (EndSlash > 0) {
    const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
    if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
    else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
  } else {
    if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
    EndSlash = UrlChA.SearchCh('/', 0);
    if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
    else { return TChA(UrlChA).ToLc(); }
  }
}

Here is the call graph for this function:

Here is the caller graph for this function:

TChA TStrUtil::GetDomNm2 ( const TChA UrlChA) [static]

Definition at line 201 of file util.cpp.

References GetDomNm(), TChA::GetSubStr(), TChA::IsPrefix(), and TInt::Mx.

Referenced by GetWebsiteNm().

                                           {
  TChA Dom = GetDomNm(UrlChA);
  if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
  else { return Dom; }
}

Here is the call graph for this function:

Here is the caller graph for this function:

bool TStrUtil::GetNormalizedUrl ( const TChA UrlIn,
const TChA BaseUrl,
TChA UrlOut 
) [static]

Quick URL nomalization: Remove ending /, /index.html, etc. and strip starting www.

Definition at line 306 of file util.cpp.

References TChA::AddCh(), TChA::Empty(), TChA::GetSubStr(), TChA::IsPrefix(), TInt::Mx, StripEnd(), and TChA::ToLc().

                                                                                    {
  UrlOut = UrlIn;
  if (StripEnd(UrlIn, "/", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
  else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
  if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
    // if UrlIn is relative url, try combine it with BaseUrl
    if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
      //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
      return false; }
    TChA Out;
    if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
    if (UrlIn[0] != '/') { Out.AddCh('/'); }
    Out += UrlOut;
    UrlOut = Out;
  }
  // http://www. --> http://
  if (UrlOut.IsPrefix("http://www.")) {
    UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
  }
  UrlOut.ToLc();
  return true;
}

Here is the call graph for this function:

TChA TStrUtil::GetShorStr ( const TChA LongStr,
const int  MaxLen = 50 
) [static]

Definition at line 342 of file util.cpp.

References TChA::GetSubStr(), and TChA::Len().

                                                               {
  if (LongStr.Len() < MaxLen) { return LongStr; }
  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
  Str += "...";
  return Str;
}

Here is the call graph for this function:

TStr TStrUtil::GetStdName ( TStr  AuthorName) [static]

Puts person's name (fist middle last) in a standard form: <last_name>_<first name innitial>

Definition at line 621 of file util.cpp.

                                         {
  TStr StdName;
  AuthorName.ToLc();
  AuthorName.ChangeChAll('\n', ' ');
  AuthorName.ChangeChAll('.', ' ');
  // if there is a number in the name, remove it and everything after it
  int i, pos = 0;
  while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
    pos++; }
  if (pos < AuthorName.Len()) {
    AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
  if (AuthorName.Empty()) { return TStr::GetNullStr(); }

  // replace everything after '('
  int b = AuthorName.SearchCh('(');
  if (b != -1) {
    AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
  // skip if contains ')'
  if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
  // skip if it is not a name
  if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
   || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
    return TStr::GetNullStr();
  }
  // remove all non-letters (latex tags, ...)
  TChA NewName;
  for (i = 0; i < AuthorName.Len(); i++) {
    const char Ch = AuthorName[i];
    if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
  }
  StdName = NewName;  StdName.ToTrunc();
  TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
  // too short -- not a name
  if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
  if (AuthNmV.Len() < 2) return TStr::GetNullStr();

  const TStr LastNm = AuthNmV.Last();
  if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();

  IAssert(isalpha(AuthNmV[0][0]));
  return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
}
void TStrUtil::GetStdNameV ( TStr  AuthorNames,
TStrV StdNameV 
) [static]

Splits a list of people's names.

Definition at line 664 of file util.cpp.

                                                            {
  AuthorNames.ChangeChAll('\n', ' ');
  AuthorNames.ToLc();
  // split into author names
  TStrV AuthV, TmpV, Tmp2V;
  // split on 'and'
  AuthorNames.SplitOnStr(" and ", TmpV);
  int i;
  for (i = 0; i < TmpV.Len(); i++) {
    TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
  // split on '&'
  TmpV = AuthV;  AuthV.Clr();
  for (i = 0; i < TmpV.Len(); i++) {
    TmpV[i].SplitOnAllCh('&', Tmp2V);  AuthV.AddV(Tmp2V); }
  // split on ','
  TmpV = AuthV;  AuthV.Clr();
  for (i = 0; i < TmpV.Len(); i++) {
    TmpV[i].SplitOnAllCh(',', Tmp2V);  AuthV.AddV(Tmp2V); }
  // split on ';'
  TmpV = AuthV;  AuthV.Clr();
  for (i = 0; i < TmpV.Len(); i++) {
    TmpV[i].SplitOnAllCh(';', Tmp2V);  AuthV.AddV(Tmp2V); }
  // standardize names
  StdNameV.Clr();
  //printf("\n*** %s\n", AuthorNames.CStr());
  for (i = 0; i < AuthV.Len(); i++) {
    TStr StdName = GetStdName(AuthV[i]);
    if (! StdName.Empty()) {
      //printf("\t%s  ==>  %s\n", AuthV[i].CStr(), StdName.CStr());
      StdNameV.Add(StdName);
    }
  }
}
bool TStrUtil::GetTmFromStr ( const char *  TmStr,
TSecTm Tm 
) [static]

Parses time in many different text formats. See source code for details.

Definition at line 571 of file util.cpp.

                                                         {
  static TStrV MonthV1, MonthV2;
  if (MonthV1.Empty()) {
    TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
    TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
  }
  TChA Tmp(TmStr);
  Tmp.ToLc();
  TVec<char *> WrdV;
  const char* End = Tmp.CStr()+Tmp.Len();
  int Col = -1, Cols=0;
  for (char *b = Tmp.CStr(); b <End; ) {
    WrdV.Add(b);
    while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
    if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++;  }
    *b=0; b++;
    while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
  }
  if (Cols == 2) {
    if (Col+1 >= WrdV.Len()) { return false; }
    WrdV.Del(Col+1);
  }
  if (Col<1) { return false; }
  const int Hr = atoi(WrdV[Col-1]);
  const int Min = atoi(WrdV[Col]);
  WrdV.Del(Col);  WrdV.Del(Col-1);
  if (WrdV.Len() != 3) { return false; }
  int y=0,m=1,d=2, Mon=-1;
  if (TCh::IsAlpha(WrdV[0][0])) {
    y=2; m=0; d=1;
  } else if (TCh::IsAlpha(WrdV[1][0])) {
    y=2; m=1; d=0;
  } else if (TCh::IsAlpha(WrdV[2][0])) {
    y=0; m=2; d=1;
  } else {
    y=0; m=1; d=2;
    Mon = atoi(WrdV[m]);
  }
  int Day = atoi(WrdV[d]);
  if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
  if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
  if (Mon == 0) { return false; }
  int Year = atoi(WrdV[y]);
  if (Day > Year) { ::Swap(Day, Year); }
  //printf("%d-%02d-%02d  %02d:%02d\n", Year, Mon, Day, Hr, Min);
  Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
  return true;
}
TChA TStrUtil::GetWebsiteNm ( const TChA UrlChA) [static]

Definition at line 218 of file util.cpp.

References GetDomNm2(), GetNthOccurence(), TChA::GetSubStr(), TChA::IsPrefix(), TChA::IsSuffix(), TChA::Len(), TChA::SearchCh(), and TChA::SearchStr().

                                                  {
  TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
  // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
  if (DomNm == "blog.myspace.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
  }
  // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
  // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
  // http://ameblo.jp/baptism/entry-10126216277.html
  // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
  // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
  // http://blogs.zdnet.com/hardware/?p=2391
  // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
  // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
  // http://blog.tv2.dk/ole.mork/entry254689.html
  // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
  // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
  // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
  if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
    || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
    || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
    || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
    || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
    || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
      return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
  }
  // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
  // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
  if (DomNm == "digg.com") {
    if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
      const int Url = PostUrlStr.SearchStr(";url=");
      if (Url != -1) {
        return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
    } else {
      return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
  }
  // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
  // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
  // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
  if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
    || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
  }
  // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
  if (DomNm=="feeds.feedburner.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
  }
  // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
  if (DomNm=="groups.google.com") {
    return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
  }
  // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
  if (DomNm=="news.google.com") { // redirect
    const int UrlPos = PostUrlStr.SearchStr("&url=");
    if (UrlPos != -1) {
      return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
  }
  // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
  if (DomNm == "bloggrevyen.no") { // redirect
    const int Http2 = PostUrlStr.SearchStr("/http://");
    if (Http2!=-1) {
      return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
  }
  //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
  //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
  if (DomNm.IsSuffix(".rd.yahoo.com")) {
    const int Http2 = PostUrlStr.SearchStr("/*");
    if (Http2!=-1) {
      return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
  }
  return DomNm;
}

Here is the call graph for this function:

void TStrUtil::GetWIdV ( const TStrHash< TInt > &  StrH,
const char *  CStr,
TIntV WIdV 
) [static]

Definition at line 538 of file util.cpp.

                                                                                {
  const int NotWId = -1;
  TChA ChA(CStr);
  TVec<char *> WrdV;
  TInt WId;
  TStrUtil::SplitWords(ChA, WrdV);
  WIdV.Clr(false);
  for (int w = 0; w < WrdV.Len(); w++) {
    if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
    else { WIdV.Add(NotWId); }
  }
}
void TStrUtil::GetXmlTagNmVal ( TXmlLx XmlLx,
TChA TagNm,
TChA TagVal 
) [static]

Definition at line 149 of file util.cpp.

References EAssertR, TXmlLx::GetSym(), TXmlLx::TagNm, TXmlLx::TxtChA, xsyETag, xsySTag, and xsyStr.

                                                                      {
  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
  TagNm = XmlLx.TagNm;
  const TXmlLxSym NextSym = XmlLx.GetSym();
  TagVal = XmlLx.TxtChA;
  if (NextSym == xsyStr) {
    EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
  } else {
    EAssertR(NextSym == xsyETag, TagNm); // empty tag
    //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
  }
}

Here is the call graph for this function:

bool TStrUtil::GetXmlTagNmVal2 ( TXmlLx XmlLx,
TChA TagNm,
TChA TagVal,
const bool &  TakeTagNms 
) [static]

Definition at line 163 of file util.cpp.

References TChA::Clr(), TChA::CStr(), TXmlLx::GetSym(), TXmlLx::Sym, TXmlLx::TagNm, TXmlLx::TxtChA, xsyETag, xsySTag, and xsyStr.

                                                                                               {
  if (XmlLx.GetSym() != xsySTag) {
    return false; }
  TagVal.Clr();
  TagNm = XmlLx.TagNm;
  //const TXmlLxSym NextSym = XmlLx.GetSym();
  while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
    if (TakeTagNms) {
      TagVal += XmlLx.TxtChA; }
    else if (XmlLx.Sym == xsyStr) {
      TagVal += XmlLx.TxtChA; }
    XmlLx.GetSym();
  }
  return true;
  //if (NextSym == xsyStr) {
  //  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
  //} else {
  //  EAssertR(NextSym == xsyETag, TagNm); // empty tag
  //  printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
  //}
}

Here is the call graph for this function:

TChA & TStrUtil::GetXmlTagVal ( TXmlLx XmlLx,
const TChA TagNm 
) [static]

Definition at line 132 of file util.cpp.

References TStr::CStr(), EAssertR, TXmlLx::GetSym(), TXmlLx::TagNm, TXmlLx::TxtChA, xsyETag, xsySTag, and xsyStr.

                                                             {
  static TChA TagVal;
  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
  EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
  const TXmlLxSym NextSym = XmlLx.GetSym();
  TagVal = XmlLx.TxtChA;
  if (NextSym == xsyStr) {
    EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
  } else {
    EAssertR(NextSym == xsyETag, TagNm); // empty tag
    //printf("  token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
  }
  EAssertR(XmlLx.TagNm == TagNm, TagNm);
  return TagVal;
}

Here is the call graph for this function:

bool TStrUtil::IsLatinStr ( const TChA Str,
const double &  MinAlFrac 
) [static]

Definition at line 527 of file util.cpp.

                                                                  {
  int AlNumCnt=0, ChCnt=0;
  for (const char *c = Str.CStr(); *c; c++) {
    if (TCh::IsWs(*c)) { continue; }
    if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
    ChCnt++;
  }
  if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
  return false;
}
void TStrUtil::RemoveHtmlTags ( const TChA HtmlStr,
TChA TextStr 
) [static]

Definition at line 481 of file util.cpp.

References TChA::AddCh(), TChA::Clr(), TChA::CStr(), and TChA::Len().

                                                                {
  TextStr.Clr();
  char *StrB, *StrE;
  // use full page html: skip till <body>
  //PageHtmlStr = "<script fdsfs>  fsdfsd </script> jure";
  /*if (UseFullHtml) {
    StrB = PageHtmlStr.CStr();
    StrE = StrB+PageHtmlStr.Len();
    char * NewB = strstr(StrB, "<body>");
    if (NewB != NULL) { StrB = NewB+6; }
    char * NewE = strstr(StrB, "body>");
    if (NewE != NULL) {
      while (true) {
        char *E=strstr(NewE+4, "body>");
        if (E == NULL) { break; }  NewE = E; }
      StrE = NewE;
    }
  } else {  // only extracted post html*/
  StrB = (char *) HtmlStr.CStr();
  StrE = (char *) StrB+HtmlStr.Len(); //}
  for (char *e = StrB; e < StrE; ) {
    char* b = e;
    while (e<StrE && *e != '<') { e++; }
    // copy text
    char tmp=*e;  *e = 0;
    TextStr+= b; TextStr.AddCh(' ');  *e = tmp;
    if (e >= StrE) { return; }
    // if start of a comment: skip
    if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
      e += 3;
      while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
      e++;  continue;
    }
    // if "<script" then skip
    if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
      e += 5;
      while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
      e++;  continue;
    }
    // skip to end of tag
    while (e < StrE && *e != '>') { e++; }
    if (e>=StrE) { return; }
    e++;
  }
}

Here is the call graph for this function:

int TStrUtil::SplitLines ( TChA ChA,
TVec< char * > &  LineV,
const bool &  SkipEmpty = false 
) [static]

Definition at line 439 of file util.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::CStr(), and TVec< TVal, TSizeTy >::Len().

                                                                              {
  LineV.Clr(false);
  LineV.Add(ChA.CStr());
  bool IsChs=false;
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if (*c == '\n') {
      if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
      *c=0;
      if (SkipEmpty) {
        if (IsChs) { LineV.Add(c+1); }
      } else {
        LineV.Add(c+1);
      }
      IsChs=false;
    } else {
      IsChs=true;
    }
  }
  return LineV.Len();
}

Here is the call graph for this function:

int TStrUtil::SplitOnCh ( TChA ChA,
TVec< char * > &  WrdV,
const char &  Ch,
const bool &  SkipEmpty = false 
) [static]

Definition at line 425 of file util.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::CStr(), TVec< TVal, TSizeTy >::DelLast(), TVec< TVal, TSizeTy >::Empty(), TVec< TVal, TSizeTy >::Last(), and TVec< TVal, TSizeTy >::Len().

                                                                                            {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if (*c == Ch) {
      *c = 0;
      if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
      WrdV.Add(c+1);
    }
  }
  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
  return WrdV.Len();
}

Here is the call graph for this function:

int TStrUtil::SplitSentences ( TChA ChA,
TVec< char * > &  SentenceV 
) [static]

Definition at line 460 of file util.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::CStr(), TCh::IsAlNum(), TCh::IsWs(), TChA::Len(), and TVec< TVal, TSizeTy >::Len().

                                                               {
  SentenceV.Clr();
  const char *B = ChA.CStr();
  const char *E = B+ChA.Len();
  char *c = (char *) B;
  while (*c && TCh::IsWs(*c)) { c++; }
  if (*c) { SentenceV.Add(c); } else { return 0; }
  for (; c < E; c++) {
    if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
      if (c<E && *(c+1)=='"') { *c='"';  c++; } // blah." --> blah"
      if (c>=E) { continue; }
      *c=0;  c++;
      char *e = c-1;
      while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
      while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
      if (c<E) { SentenceV.Add(c); }
    }
  }
  return SentenceV.Len();
}

Here is the call graph for this function:

int TStrUtil::SplitWords ( TChA ChA,
TVec< char * > &  WrdV,
const bool &  SplitOnWs = true 
) [static]

Definition at line 412 of file util.cpp.

References TVec< TVal, TSizeTy >::Add(), TVec< TVal, TSizeTy >::Clr(), TChA::CStr(), TVec< TVal, TSizeTy >::DelLast(), TVec< TVal, TSizeTy >::Empty(), TCh::IsAlNum(), TVec< TVal, TSizeTy >::Last(), and TVec< TVal, TSizeTy >::Len().

Referenced by CountWords().

                                                                             {
  WrdV.Clr(false);
  WrdV.Add(ChA.CStr());
  for (char *c = (char *) ChA.CStr(); *c; c++) {
    if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
      *c = 0;
      if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); }
      WrdV.Add(c+1);
    }
  }
  return WrdV.Len();
}

Here is the call graph for this function:

Here is the caller graph for this function:

bool TStrUtil::StripEnd ( const TChA Str,
const TChA SearchStr,
TChA NewStr 
) [static]

Definition at line 331 of file util.cpp.

References TChA::GetSubStr(), and TChA::Len().

Referenced by GetNormalizedUrl().

                                                                            {
  const int StrLen = Str.Len();
  const int SearchStrLen = SearchStr.Len();
  if (StrLen < SearchStrLen) { return false; }
  for (int i = 0; i < SearchStrLen; i++) {
    if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
  }
  NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
  return true;
}

Here is the call graph for this function:

Here is the caller graph for this function:


The documentation for this class was generated from the following files: