SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
TStrUtil Class Reference

String helper functions and utilities. Quick and dirty! More...

#include <util.h>

Static Public Member Functions

static TChA & GetXmlTagVal (TXmlLx &XmlLx, const TChA &TagNm)
 
static void GetXmlTagNmVal (TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal)
 
static bool GetXmlTagNmVal2 (TXmlLx &XmlLx, TChA &TagNm, TChA &TagVal, const bool &TakeTagNms)
 
static TChA GetDomNm (const TChA &UrlChA)
 
static TChA GetDomNm2 (const TChA &UrlChA)
 
static TChA GetWebsiteNm (const TChA &UrlChA)
 
static bool GetNormalizedUrl (const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
 Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www. More...
 
static bool StripEnd (const TChA &Str, const TChA &SearchStr, TChA &NewStr)
 
static TChA GetShorStr (const TChA &LongStr, const int MaxLen=50)
 
static TChA GetCleanStr (const TChA &ChA)
 
static TChA GetCleanWrdStr (const TChA &ChA)
 
static int CountWords (const char *CStr)
 
static int CountWords (const TChA &ChA)
 
static int CountWords (const TChA &ChA, const TStrHash< TInt > &StopWordH)
 
static int SplitWords (TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
 
static int SplitOnCh (TChA &ChA, TVec< char * > &WrdV, const char &Ch, const bool &SkipEmpty=false)
 
static int SplitLines (TChA &ChA, TVec< char * > &LineV, const bool &SkipEmpty=false)
 
static int SplitSentences (TChA &ChA, TVec< char * > &SentenceV)
 
static void RemoveHtmlTags (const TChA &HtmlStr, TChA &TextStr)
 
static bool IsLatinStr (const TChA &Str, const double &MinAlFrac)
 
static void GetWIdV (const TStrHash< TInt > &StrH, const char *CStr, TIntV &WIdV)
 
static void GetAddWIdV (TStrHash< TInt > &StrH, const char *CStr, TIntV &WIdV)
 
static bool GetTmFromStr (const char *TmStr, TSecTm &Tm)
 Parses time in many different text formats. See source code for details. More...
 
static TStr GetStdName (TStr AuthorName)
 Puts person's name (first middle last) in a standard form: <last_name>_<first name initial> More...
 
static void GetStdNameV (TStr AuthorNames, TStrV &StdNameV)
 Splits a list of people's names. More...
 

Detailed Description

String helper functions and utilities. Quick and dirty!

Definition at line 34 of file util.h.

Member Function Documentation

int TStrUtil::CountWords ( const char *  CStr)
static

Definition at line 393 of file util.cpp.

393  {
394  int WrdCnt = 1;
395  for (const char *c = CStr; *c; c++) {
396  if (TCh::IsWs(*c)) { WrdCnt++; }
397  }
398  return WrdCnt;
399 }
static bool IsWs(const char &Ch)
Definition: dt.h:970
int TStrUtil::CountWords ( const TChA &ChA)
static

Definition at line 389 of file util.cpp.

389  {
390  return CountWords(ChA.CStr());
391 }
char * CStr()
Definition: dt.h:255
static int CountWords(const char *CStr)
Definition: util.cpp:393
int TStrUtil::CountWords ( const TChA &ChA,
const TStrHash< TInt > &  StopWordH 
)
static

Definition at line 401 of file util.cpp.

401  {
402  TChA Tmp;
403  TVec<char *> WrdV;
404  SplitWords(Tmp, WrdV);
405  int SWordCnt = 0;
406  for (int w = 0; w < WrdV.Len(); w++) {
407  if (StopWordH.IsKey(WrdV[w])) { SWordCnt++; }
408  }
409  return WrdV.Len() - SWordCnt;
410 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool IsKey(const char *Key) const
Definition: hash.h:825
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
Definition: util.cpp:412
Definition: dt.h:201
void TStrUtil::GetAddWIdV ( TStrHash< TInt > &  StrH,
const char *  CStr,
TIntV &WIdV 
)
static

Definition at line 552 of file util.cpp.

552  {
553  TChA ChA(CStr);
554  TVec<char *> WrdV;
555  TInt WId;
556  TStrUtil::SplitWords(ChA, WrdV);
557  WIdV.Clr(false);
558  for (int w = 0; w < WrdV.Len(); w++) {
559  WIdV.Add(StrH.AddDatId(WrdV[w]));
560  }
561 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TDat & AddDatId(const char *Key)
Definition: hash.h:786
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
Definition: util.cpp:412
Definition: dt.h:1044
Definition: dt.h:201
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
TChA TStrUtil::GetCleanStr ( const TChA &ChA)
static

Definition at line 372 of file util.cpp.

372  {
373  char *b = (char *) ChA.CStr();
374  while (*b && ! TCh::IsAlNum(*b)) { b++; }
375  if (*b == 0) { return TChA(); }
376  TChA OutChA(ChA.Len());
377  char *e = b;
378  bool ws=false;
379  while (*e) {
380  while (*e && TCh::IsWs(*e)) { e++; ws=true; }
381  if (! *e) { break; }
382  if (ws) { OutChA.AddCh(' '); ws=false; }
383  OutChA.AddCh(*e);
384  e++;
385  }
386  //OutChA.ToLc();
387  return OutChA;
388 }
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
int Len() const
Definition: dt.h:259
static bool IsWs(const char &Ch)
Definition: dt.h:970
char * CStr()
Definition: dt.h:255
Definition: dt.h:201
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
TChA TStrUtil::GetCleanWrdStr ( const TChA &ChA)
static

Definition at line 350 of file util.cpp.

350  {
351  char *b = (char *) ChA.CStr();
352  while (*b && ! TCh::IsAlNum(*b)) { b++; }
353  if (*b == 0) { return TChA(); }
354  TChA OutChA(ChA.Len());
355  char *e = b, tmp;
356  while (*e) {
357  b = e;
358  while (*e && (TCh::IsAlNum(*e) || ((*e=='\'' || *e=='-') && TCh::IsAlNum(*(e+1))))) { e++; }
359  if (b < e) {
360  tmp = *e; *e=0;
361  OutChA += b; OutChA.AddCh(' ');
362  *e = tmp;
363  }
364  while (*e && ! TCh::IsAlNum(*e)) { e++; }
365  if (! *e) { break; }
366  }
367  OutChA.DelLastCh(); OutChA.ToLc();
368  return OutChA;
369 }
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
int Len() const
Definition: dt.h:259
char * CStr()
Definition: dt.h:255
Definition: dt.h:201
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
TChA TStrUtil::GetDomNm ( const TChA &UrlChA)
static

Definition at line 187 of file util.cpp.

187  {
188  int EndSlash = UrlChA.SearchCh('/', 7)-1; // skip starting http://
189  if (EndSlash > 0) {
190  const int BegSlash = UrlChA.SearchChBack('/', EndSlash);
191  if (BegSlash > 0) { return UrlChA.GetSubStr(BegSlash+1, EndSlash).ToLc(); }
192  else { return UrlChA.GetSubStr(0, UrlChA.SearchCh('/', 0)-1).ToLc(); }
193  } else {
194  if (UrlChA.IsPrefix("http://")) { return UrlChA.GetSubStr(7, UrlChA.Len()-1).ToLc(); }
195  EndSlash = UrlChA.SearchCh('/', 0);
196  if (EndSlash > 0) { return UrlChA.GetSubStr(0, EndSlash-1).ToLc(); }
197  else { return TChA(UrlChA).ToLc(); }
198  }
199 }
int Len() const
Definition: dt.h:259
int SearchChBack(const char &Ch, int BChN=-1) const
Definition: dt.cpp:477
bool IsPrefix(const char *CStr, const int &BChN=0) const
Definition: dt.cpp:499
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
TChA & ToLc()
Definition: dt.cpp:552
Definition: dt.h:201
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:470
TChA TStrUtil::GetDomNm2 ( const TChA &UrlChA)
static

Definition at line 201 of file util.cpp.

201  {
202  TChA Dom = GetDomNm(UrlChA);
203  if (Dom.IsPrefix("www.")) { return Dom.GetSubStr(4, TInt::Mx); }
204  else { return Dom; }
205 }
static TChA GetDomNm(const TChA &UrlChA)
Definition: util.cpp:187
static const int Mx
Definition: dt.h:1049
bool IsPrefix(const char *CStr, const int &BChN=0) const
Definition: dt.cpp:499
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
Definition: dt.h:201
bool TStrUtil::GetNormalizedUrl ( const TChA &UrlIn,
const TChA &BaseUrl,
TChA &UrlOut 
)
static

Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www.

Definition at line 306 of file util.cpp.

306  {
307  UrlOut = UrlIn;
308  if (StripEnd(UrlIn, "/", UrlOut)) {}
309  else if (StripEnd(UrlIn, "/index.html", UrlOut)) {}
310  else if (StripEnd(UrlIn, "/index.htm", UrlOut)) {}
311  else if (StripEnd(UrlIn, "/index.php", UrlOut)) {}
312  if (! (UrlOut.IsPrefix("http://") || UrlOut.IsPrefix("ftp://"))) {
313  // if UrlIn is relative url, try combine it with BaseUrl
314  if (UrlIn.Empty() || ! (BaseUrl.IsPrefix("http://") || BaseUrl.IsPrefix("ftp://"))) {
315  //printf("** Bad URL: base:'%s' url:'%s'\n", BaseUrl.CStr(), UrlIn.CStr());
316  return false; }
317  TChA Out;
318  if (! GetNormalizedUrl(BaseUrl, TChA(), Out)) { return false; }
319  if (UrlIn[0] != '/') { Out.AddCh('/'); }
320  Out += UrlOut;
321  UrlOut = Out;
322  }
323  // http://www. --> http://
324  if (UrlOut.IsPrefix("http://www.")) {
325  UrlOut = TChA("http://") + UrlOut.GetSubStr(11, TInt::Mx);
326  }
327  UrlOut.ToLc();
328  return true;
329 }
static bool GetNormalizedUrl(const TChA &UrlIn, const TChA &BaseUrl, TChA &UrlOut)
Quick URL normalization: Remove ending /, /index.html, etc. and strip starting www.
Definition: util.cpp:306
bool Empty() const
Definition: dt.h:260
static const int Mx
Definition: dt.h:1049
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
bool IsPrefix(const char *CStr, const int &BChN=0) const
Definition: dt.cpp:499
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
TChA & ToLc()
Definition: dt.cpp:552
Definition: dt.h:201
static bool StripEnd(const TChA &Str, const TChA &SearchStr, TChA &NewStr)
Definition: util.cpp:331
TChA TStrUtil::GetShorStr ( const TChA &LongStr,
const int  MaxLen = 50 
)
static

Definition at line 342 of file util.cpp.

342  {
343  if (LongStr.Len() < MaxLen) { return LongStr; }
344  TChA Str = LongStr.GetSubStr(0, MaxLen-1);
345  Str += "...";
346  return Str;
347 }
int Len() const
Definition: dt.h:259
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448
Definition: dt.h:201
TStr TStrUtil::GetStdName ( TStr  AuthorName)
static

Puts person's name (first middle last) in a standard form: <last_name>_<first name initial>

Definition at line 621 of file util.cpp.

621  {
622  TStr StdName;
623  AuthorName.ToLc();
624  AuthorName.ChangeChAll('\n', ' ');
625  AuthorName.ChangeChAll('.', ' ');
626  // if there is a number in the name, remove it and everything after it
627  int i, pos = 0;
628  while (pos<AuthorName.Len() && (AuthorName[pos]!='#' && !TCh::IsNum(AuthorName[pos]))) {
629  pos++; }
630  if (pos < AuthorName.Len()) {
631  AuthorName = AuthorName.GetSubStr(0, pos-1).ToTrunc(); }
632  if (AuthorName.Empty()) { return TStr::GetNullStr(); }
633 
634  // replace everything after '('
635  int b = AuthorName.SearchCh('(');
636  if (b != -1) {
637  AuthorName = AuthorName.GetSubStr(0, b-1).ToTrunc(); }
638  // skip if contains ')'
639  if (AuthorName .SearchCh(')')!=-1) { return TStr::GetNullStr(); }
640  // skip if it is not a name
641  if (AuthorName .SearchStr("figures")!=-1 || AuthorName .SearchStr("macros")!=-1
642  || AuthorName .SearchStr("univ")!=-1 || AuthorName .SearchStr("institute")!=-1) {
643  return TStr::GetNullStr();
644  }
645  // remove all non-letters (latex tags, ...)
646  TChA NewName;
647  for (i = 0; i < AuthorName.Len(); i++) {
648  const char Ch = AuthorName[i];
649  if (TCh::IsAlpha(Ch) || TCh::IsWs(Ch) || Ch=='-') { NewName += Ch; }
650  }
651  StdName = NewName; StdName.ToTrunc();
652  TStrV AuthNmV; StdName.SplitOnWs(AuthNmV);
653  // too short -- not a name
654  if (! AuthNmV.Empty() && AuthNmV.Last() == "jr") AuthNmV.DelLast();
655  if (AuthNmV.Len() < 2) return TStr::GetNullStr();
656 
657  const TStr LastNm = AuthNmV.Last();
658  if (! TCh::IsAlpha(LastNm[0]) || LastNm.Len() == 1) return TStr::GetNullStr();
659 
660  IAssert(isalpha(AuthNmV[0][0]));
661  return TStr::Fmt("%s_%c", LastNm.CStr(), AuthNmV[0][0]);
662 }
#define IAssert(Cond)
Definition: bd.h:262
int SearchCh(const char &Ch, const int &BChN=0) const
Definition: dt.cpp:1043
int Len() const
Definition: dt.h:487
static bool IsNum(const char &Ch)
Definition: dt.h:974
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
int ChangeChAll(const char &SrcCh, const char &DstCh)
Definition: dt.cpp:1113
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
static bool IsWs(const char &Ch)
Definition: dt.h:970
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
static TStr GetNullStr()
Definition: dt.cpp:1626
TStr & ToLc()
Definition: dt.cpp:758
Definition: dt.h:201
static bool IsAlpha(const char &Ch)
Definition: dt.h:972
Definition: dt.h:412
bool Empty() const
Definition: dt.h:488
TStr & ToTrunc()
Definition: dt.cpp:770
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
void SplitOnWs(TStrV &StrV) const
Definition: dt.cpp:972
char * CStr()
Definition: dt.h:476
void DelLast()
Removes the last element of the vector.
Definition: ds.h:635
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
void TStrUtil::GetStdNameV ( TStr  AuthorNames,
TStrV &StdNameV 
)
static

Splits a list of people's names.

Definition at line 664 of file util.cpp.

664  {
665  AuthorNames.ChangeChAll('\n', ' ');
666  AuthorNames.ToLc();
667  // split into author names
668  TStrV AuthV, TmpV, Tmp2V;
669  // split on 'and'
670  AuthorNames.SplitOnStr(" and ", TmpV);
671  int i;
672  for (i = 0; i < TmpV.Len(); i++) {
673  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
674  // split on '&'
675  TmpV = AuthV; AuthV.Clr();
676  for (i = 0; i < TmpV.Len(); i++) {
677  TmpV[i].SplitOnAllCh('&', Tmp2V); AuthV.AddV(Tmp2V); }
678  // split on ','
679  TmpV = AuthV; AuthV.Clr();
680  for (i = 0; i < TmpV.Len(); i++) {
681  TmpV[i].SplitOnAllCh(',', Tmp2V); AuthV.AddV(Tmp2V); }
682  // split on ';'
683  TmpV = AuthV; AuthV.Clr();
684  for (i = 0; i < TmpV.Len(); i++) {
685  TmpV[i].SplitOnAllCh(';', Tmp2V); AuthV.AddV(Tmp2V); }
686  // standardize names
687  StdNameV.Clr();
688  //printf("\n*** %s\n", AuthorNames.CStr());
689  for (i = 0; i < AuthV.Len(); i++) {
690  TStr StdName = GetStdName(AuthV[i]);
691  if (! StdName.Empty()) {
692  //printf("\t%s ==> %s\n", AuthV[i].CStr(), StdName.CStr());
693  StdNameV.Add(StdName);
694  }
695  }
696 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
int ChangeChAll(const char &SrcCh, const char &DstCh)
Definition: dt.cpp:1113
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
TStr & ToLc()
Definition: dt.cpp:758
static TStr GetStdName(TStr AuthorName)
Puts person's name (first middle last) in a standard form: <last_name>_<first name initial> ...
Definition: util.cpp:621
Definition: dt.h:412
bool Empty() const
Definition: dt.h:488
void SplitOnStr(const TStr &SplitStr, TStrV &StrV) const
Definition: dt.cpp:1008
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the to end of the vector.
Definition: ds.h:1056
bool TStrUtil::GetTmFromStr ( const char *  TmStr,
TSecTm &Tm 
)
static

Parses time in many different text formats. See source code for details.

Definition at line 571 of file util.cpp.

571  {
572  static TStrV MonthV1, MonthV2;
573  if (MonthV1.Empty()) {
574  TStr("january|february|march|april|may|june|july|august|september|october|november|december").SplitOnAllCh('|', MonthV1);
575  TStr("jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec").SplitOnAllCh('|', MonthV2);
576  }
577  TChA Tmp(TmStr);
578  Tmp.ToLc();
579  TVec<char *> WrdV;
580  const char* End = Tmp.CStr()+Tmp.Len();
581  int Col = -1, Cols=0;
582  for (char *b = Tmp.CStr(); b <End; ) {
583  WrdV.Add(b);
584  while (*b && ! (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
585  if (*b==':') { if(Col==-1) { Col=WrdV.Len(); } Cols++; }
586  *b=0; b++;
587  while (*b && (*b==' ' || *b=='-' || *b==':' || *b==',')) { b++; }
588  }
589  if (Cols == 2) {
590  if (Col+1 >= WrdV.Len()) { return false; }
591  WrdV.Del(Col+1);
592  }
593  if (Col<1) { return false; }
594  const int Hr = atoi(WrdV[Col-1]);
595  const int Min = atoi(WrdV[Col]);
596  WrdV.Del(Col); WrdV.Del(Col-1);
597  if (WrdV.Len() != 3) { return false; }
598  int y=0,m=1,d=2, Mon=-1;
599  if (TCh::IsAlpha(WrdV[0][0])) {
600  y=2; m=0; d=1;
601  } else if (TCh::IsAlpha(WrdV[1][0])) {
602  y=2; m=1; d=0;
603  } else if (TCh::IsAlpha(WrdV[2][0])) {
604  y=0; m=2; d=1;
605  } else {
606  y=0; m=1; d=2;
607  Mon = atoi(WrdV[m]);
608  }
609  int Day = atoi(WrdV[d]);
610  if (Mon <= 0) { Mon = MonthV1.SearchForw(WrdV[m])+1; }
611  if (Mon <= 0) { Mon = MonthV2.SearchForw(WrdV[m])+1; }
612  if (Mon == 0) { return false; }
613  int Year = atoi(WrdV[y]);
614  if (Day > Year) { ::Swap(Day, Year); }
615  //printf("%d-%02d-%02d %02d:%02d\n", Year, Mon, Day, Hr, Min);
616  Tm = TSecTm(Year, Mon, Day, Hr, Min, 0);
617  return true;
618 }
void Del(const TSizeTy &ValN)
Removes the element at position ValN.
Definition: ds.h:1130
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
Definition: dt.h:201
Definition: tm.h:81
static bool IsAlpha(const char &Ch)
Definition: dt.h:972
Definition: dt.h:412
void SplitOnAllCh(const char &SplitCh, TStrV &StrV, const bool &SkipEmpty=true) const
Definition: dt.cpp:926
TSizeTy SearchForw(const TVal &Val, const TSizeTy &BValN=0) const
Returns the position of an element with value Val.
Definition: ds.h:1487
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void Swap(TRec &Rec1, TRec &Rec2)
Definition: bd.h:568
Vector is a sequence TVal objects representing an array that can change in size.
Definition: ds.h:429
TChA TStrUtil::GetWebsiteNm ( const TChA &UrlChA)
static

Definition at line 218 of file util.cpp.

218  {
219  TChA DomNm = TStrUtil::GetDomNm2(PostUrlStr);
220  // http://blog.myspace.com/index.cfm?fuseaction=blog.view&friendid=141560&blogid=420009539
221  if (DomNm == "blog.myspace.com") {
222  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 2, '&')-1);
223  }
224  // For these websites take the domain name and 1st directory: http://blogs.msdn.com/squasta
225  // http://blogs.msdn.com/squasta/archive/2008/08/11/annonces-microsoft-au-black-hat-2008.aspx
226  // http://ameblo.jp/baptism/entry-10126216277.html
227  // http://xfruits.com/fcuignet/?id=8793&clic=249862689&url=http%3a%2f%2fnews.google.com%2fnews%2furl%3fsa%3dt%26ct%3dfr%2f9-0%26fd%3dr%26url%3dhttp%3a%2f%2fwww.investir-en-tunisie.net%2fnews%2farticle.php%253fid%253d5026%26cid%3d1241943065%26ei%3doy6gslh9jzycxahkjfxucw%26usg%3dafqjcnen_bczqldodsyga6zps2axphxl3q
228  // http://scienceblogs.com/grrlscientist/2008/08/reader_comments.php
229  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
230  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
231  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
232  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
233  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
234  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
235  // http://blogs.sun.com/geertjan/entry/wicket_in_action_undoubtedly_the
236  // http://blog.wired.com/gadgets/2008/08/apple-sells-60.html
237  // http://weblogs.asp.net/mehfuzh/archive/2008/08/11/linqextender-1-4-enhanced-object-tracking.aspx
238  // http://blogs.technet.com/plitpromicrosoftcom/archive/2008/08/11/nowa-karta-sim.aspx
239  // http://blogs.guardian.co.uk/greenslade/2008/08/murdoch_aims_to_boost_subscrib.html
240  // http://blogs.clarin.com/quimeykiltru/2008/8/11/mentira-mentira-creo
241  // http://blogs.zdnet.com/hardware/?p=2391
242  // http://blogs.citypages.com/sports/2008/08/ufc_87_seek_and.php
243  // http://voices.washingtonpost.com/achenblog/2008/08/no_medal_for_bush.html
244  // http://blog.tv2.dk/ole.mork/entry254689.html
245  // http://blogs.menomoneefallsnow.com/in_the_race/archive/2008/08/11/sometimes-it-s-about-how-you-play-the-game.asp
246  // http://weblogs.baltimoresun.com/entertainment/midnight_sun/blog/2008/08/heidis_bad_break_with_dubai_pa.html
247  // http://eonline.com/uberblog/b23076_youtubular_from_rickrolled_barackrolled.html?sid=rss_topstories&utm_source=eo
248  if (DomNm=="blogs.msdn.com" || DomNm=="ameblo.jp" || DomNm=="xfruits.com" || DomNm=="scienceblogs.com" || DomNm=="blogs.sun.com"
249  || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net" || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.co"
250  || DomNm=="blogs.clarin.com" || DomNm=="blogs.sun.com" || DomNm=="blog.wired.com" || DomNm=="weblogs.asp.net"
251  || DomNm=="blogs.technet.com" || DomNm=="blogs.guardian.com" || DomNm=="blogs.clarin.com" || DomNm=="blogs.zdnet.com"
252  || DomNm=="blogs.citypages.com" || DomNm=="voices.washingtonpost.com" || DomNm=="blog.tv2.dk"
253  || DomNm=="blogs.menomoneefallsnow.com" || DomNm=="weblogs.baltimoresun.com" || DomNm=="eonline.com") {
254  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1);
255  }
256  // http://digg.com/submit?phase=2&amp;url=http://socialitelife.celebuzz.com/archive/2008/07/31/and_then_a_hero_came_along.php&amp;title=and
257  // http://digg.com/general_sciences/mental_images_are_like_pictures_slide_show
258  if (DomNm == "digg.com") {
259  if (PostUrlStr.IsPrefix("http://digg.com/submit?")) {
260  const int Url = PostUrlStr.SearchStr(";url=");
261  if (Url != -1) {
262  return GetWebsiteNm(PostUrlStr.GetSubStr(Url+5, PostUrlStr.SearchCh('&', Url+5))); }
263  } else {
264  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 4)-1); }
265  }
266  // For these websites take the domain name and 2 directories: http://bbc.co.uk/blogs/thereporters/
267  // http://bbc.co.uk/blogs/thereporters/markdevenport/2008/08/back_to_porridge.html
268  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
269  // http://newsbusters.org/blogs/p-j-gladnick/2008/08/11/sf-chronicle-writer-predicts-global-warming-shellfish-invas
270  // http://nydailynews.com/blogs/subwaysquawkers/2008/08/anaheim-is-no-magic-kingdom-fo.html
271  if (PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://bbc.co.uk/blogs/")
272  || PostUrlStr.IsPrefix("http://nydailynews.com/blogs/") || PostUrlStr.IsPrefix("http://newsbusters.org/blogs/")) {
273  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
274  }
275  // http://feeds.feedburner.com/~r/adesblog/ ~3/361711640
276  if (DomNm=="feeds.feedburner.com") {
277  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
278  }
279  // http://groups.google.com/group/news.admin.net-abuse.sightings/browse_thread/thread/8452c47949453216/f07daa509b90295c?show_docid=f07daa509b90295c
280  if (DomNm=="groups.google.com") {
281  return PostUrlStr.GetSubStr(7, GetNthOccurence(PostUrlStr, 5)-1);
282  }
283  // http://news.google.com/news/url?sa=t&ct=us/20-0&fd=r&url=http://www.theobserver.ca/articledisplay.aspx%3fe%3d1151495&cid=0&ei=yswgsjpndpbi8atc9knacw&usg=afqjcnhrbg-nc9z6ymtqfkear3_npwqqxa
284  if (DomNm=="news.google.com") { // redirect
285  const int UrlPos = PostUrlStr.SearchStr("&url=");
286  if (UrlPos != -1) {
287  return GetWebsiteNm(PostUrlStr.GetSubStr(UrlPos+5, PostUrlStr.SearchCh('&', UrlPos+5))); }
288  }
289  // http://bloggrevyen.no/go/110340/http://blog.christergulbrandsen.com/2008/08/11/is-nationalism-the-only-way-to-de
290  if (DomNm == "bloggrevyen.no") { // redirect
291  const int Http2 = PostUrlStr.SearchStr("/http://");
292  if (Http2!=-1) {
293  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+1, PostUrlStr.Len()-1)); }
294  }
295  //http://us.rd.yahoo.com/dailynews/rss/search/urgent+care/sig=11phgb4tu/*http%3a//www.newswise.com/articles/view/543340/?sc=rsmn
296  //http://ca.rd.yahoo.com/dailynews/rss/topstories/*http://ca.news.yahoo.com/s/reuters/080801/n_top_news/news_afgha
297  if (DomNm.IsSuffix(".rd.yahoo.com")) {
298  const int Http2 = PostUrlStr.SearchStr("/*");
299  if (Http2!=-1) {
300  return GetWebsiteNm(PostUrlStr.GetSubStr(Http2+9, PostUrlStr.Len()-1)); }
301  }
302  return DomNm;
303 }
static TChA GetWebsiteNm(const TChA &UrlChA)
Definition: util.cpp:218
static TChA GetDomNm2(const TChA &UrlChA)
Definition: util.cpp:201
Definition: dt.h:201
bool IsSuffix(const char *CStr) const
Definition: dt.cpp:518
int GetNthOccurence(const TChA &Url, const int &Count, const char Ch='/')
Definition: util.cpp:207
void TStrUtil::GetWIdV ( const TStrHash< TInt > &  StrH,
const char *  CStr,
TIntV &WIdV 
)
static

Definition at line 538 of file util.cpp.

538  {
539  const int NotWId = -1;
540  TChA ChA(CStr);
541  TVec<char *> WrdV;
542  TInt WId;
543  TStrUtil::SplitWords(ChA, WrdV);
544  WIdV.Clr(false);
545  for (int w = 0; w < WrdV.Len(); w++) {
546  if (StrH.IsKeyGetDat(WrdV[w], WId)) { WIdV.Add(WId); }
547  else { WIdV.Add(NotWId); }
548  }
549 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
static int SplitWords(TChA &ChA, TVec< char * > &WrdV, const bool &SplitOnWs=true)
Definition: util.cpp:412
Definition: dt.h:1044
Definition: dt.h:201
bool IsKeyGetDat(const char *Key, TDat &Dat) const
Definition: hash.h:829
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void TStrUtil::GetXmlTagNmVal ( TXmlLx &XmlLx,
TChA &TagNm,
TChA &TagVal 
)
static

Definition at line 149 of file util.cpp.

149  {
150  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
151  TagNm = XmlLx.TagNm;
152  const TXmlLxSym NextSym = XmlLx.GetSym();
153  TagVal = XmlLx.TxtChA;
154  if (NextSym == xsyStr) {
155  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
156  } else {
157  EAssertR(NextSym == xsyETag, TagNm); // empty tag
158  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
159  }
160 }
TXmlLxSym GetSym()
Definition: xml.cpp:757
TStr TagNm
Definition: xml.h:141
TChA TxtChA
Definition: xml.h:140
Definition: xml.h:93
Definition: xml.h:93
Definition: xml.h:93
TXmlLxSym
Definition: xml.h:89
#define EAssertR(Cond, MsgStr)
Definition: bd.h:283
bool TStrUtil::GetXmlTagNmVal2 ( TXmlLx &XmlLx,
TChA &TagNm,
TChA &TagVal,
const bool &  TakeTagNms 
)
static

Definition at line 163 of file util.cpp.

163  {
164  if (XmlLx.GetSym() != xsySTag) {
165  return false; }
166  TagVal.Clr();
167  TagNm = XmlLx.TagNm;
168  //const TXmlLxSym NextSym = XmlLx.GetSym();
169  while (XmlLx.Sym != xsyETag || XmlLx.TagNm != TagNm.CStr()) {
170  if (TakeTagNms) {
171  TagVal += XmlLx.TxtChA; }
172  else if (XmlLx.Sym == xsyStr) {
173  TagVal += XmlLx.TxtChA; }
174  XmlLx.GetSym();
175  }
176  return true;
177  //if (NextSym == xsyStr) {
178  // EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
179  //} else {
180  // EAssertR(NextSym == xsyETag, TagNm); // empty tag
181  // printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
182  //}
183 }
TXmlLxSym GetSym()
Definition: xml.cpp:757
TStr TagNm
Definition: xml.h:141
TChA TxtChA
Definition: xml.h:140
void Clr()
Definition: dt.h:258
char * CStr()
Definition: dt.h:255
Definition: xml.h:93
Definition: xml.h:93
Definition: xml.h:93
TXmlLxSym Sym
Definition: xml.h:139
TChA & TStrUtil::GetXmlTagVal ( TXmlLx &XmlLx,
const TChA &TagNm 
)
static

Definition at line 132 of file util.cpp.

132  {
133  static TChA TagVal;
134  EAssertR(XmlLx.GetSym() == xsySTag, TagNm);
135  EAssertR(TagNm == XmlLx.TagNm.CStr(), TagNm);
136  const TXmlLxSym NextSym = XmlLx.GetSym();
137  TagVal = XmlLx.TxtChA;
138  if (NextSym == xsyStr) {
139  EAssertR(XmlLx.GetSym() == xsyETag, TagNm);
140  } else {
141  EAssertR(NextSym == xsyETag, TagNm); // empty tag
142  //printf(" token: %s empty! %s\n", XmlLx.TagNm.CStr(), XmlLx.GetFPosStr().CStr());
143  }
144  EAssertR(XmlLx.TagNm == TagNm, TagNm);
145  return TagVal;
146 }
TXmlLxSym GetSym()
Definition: xml.cpp:757
TStr TagNm
Definition: xml.h:141
TChA TxtChA
Definition: xml.h:140
Definition: xml.h:93
Definition: xml.h:93
Definition: xml.h:93
Definition: dt.h:201
TXmlLxSym
Definition: xml.h:89
#define EAssertR(Cond, MsgStr)
Definition: bd.h:283
char * CStr()
Definition: dt.h:476
bool TStrUtil::IsLatinStr ( const TChA &Str,
const double &  MinAlFrac 
)
static

Definition at line 527 of file util.cpp.

527  {
528  int AlNumCnt=0, ChCnt=0;
529  for (const char *c = Str.CStr(); *c; c++) {
530  if (TCh::IsWs(*c)) { continue; }
531  if (*c > 0 && TCh::IsAlNum(*c)) { AlNumCnt++; }
532  ChCnt++;
533  }
534  if (double(AlNumCnt)/double(ChCnt) > MinAlFrac) { return true; }
535  return false;
536 }
static bool IsWs(const char &Ch)
Definition: dt.h:970
char * CStr()
Definition: dt.h:255
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
void TStrUtil::RemoveHtmlTags ( const TChA &HtmlStr,
TChA &TextStr 
)
static

Definition at line 481 of file util.cpp.

481  {
482  TextStr.Clr();
483  char *StrB, *StrE;
484  // use full page html: skip till <body>
485  //PageHtmlStr = "<script fdsfs> fsdfsd </script> jure";
486  /*if (UseFullHtml) {
487  StrB = PageHtmlStr.CStr();
488  StrE = StrB+PageHtmlStr.Len();
489  char * NewB = strstr(StrB, "<body>");
490  if (NewB != NULL) { StrB = NewB+6; }
491  char * NewE = strstr(StrB, "body>");
492  if (NewE != NULL) {
493  while (true) {
494  char *E=strstr(NewE+4, "body>");
495  if (E == NULL) { break; } NewE = E; }
496  StrE = NewE;
497  }
498  } else { // only extracted post html*/
499  StrB = (char *) HtmlStr.CStr();
500  StrE = (char *) StrB+HtmlStr.Len(); //}
501  for (char *e = StrB; e < StrE; ) {
502  char* b = e;
503  while (e<StrE && *e != '<') { e++; }
504  // copy text
505  char tmp=*e; *e = 0;
506  TextStr+= b; TextStr.AddCh(' '); *e = tmp;
507  if (e >= StrE) { return; }
508  // if start of a comment: skip
509  if (e[1]=='!' && e[2]=='-' && e[3]=='-') { // comment
510  e += 3;
511  while(e<StrE && !(*(e-2)=='-' && *(e-1)=='-' && *e=='>')) { e++; }
512  e++; continue;
513  }
514  // if "<script" then skip
515  if (e[1]=='s' && e[2]=='c' && e[3]=='r' && e[4]=='i' && e[5]=='p' && e[6]=='t') {
516  e += 5;
517  while(e<StrE && !(*(e-6)=='s' && *(e-5)=='c' && *(e-4)=='r' && *(e-3)=='i' && *(e-2)=='p' && *(e-1)=='t' && *e=='>')) { e++; }
518  e++; continue;
519  }
520  // skip to end of tag
521  while (e < StrE && *e != '>') { e++; }
522  if (e>=StrE) { return; }
523  e++;
524  }
525 }
void Clr()
Definition: dt.h:258
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
int Len() const
Definition: dt.h:259
char * CStr()
Definition: dt.h:255
int TStrUtil::SplitLines ( TChA &ChA,
TVec< char * > &  LineV,
const bool &  SkipEmpty = false 
)
static

Definition at line 439 of file util.cpp.

439  {
440  LineV.Clr(false);
441  LineV.Add(ChA.CStr());
442  bool IsChs=false;
443  for (char *c = (char *) ChA.CStr(); *c; c++) {
444  if (*c == '\n') {
445  if (c > ChA.CStr() && *(c-1)=='\r') { *(c-1)=0; } // \r\n
446  *c=0;
447  if (SkipEmpty) {
448  if (IsChs) { LineV.Add(c+1); }
449  } else {
450  LineV.Add(c+1);
451  }
452  IsChs=false;
453  } else {
454  IsChs=true;
455  }
456  }
457  return LineV.Len();
458 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
char * CStr()
Definition: dt.h:255
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
int TStrUtil::SplitOnCh ( TChA ChA,
TVec< char * > &  WrdV,
const char &  Ch,
const bool &  SkipEmpty = false 
)
static

Definition at line 425 of file util.cpp.

// Splits ChA in place on the character Ch: every occurrence of Ch is overwritten
// with 0 and a pointer to the start of each field is appended to WrdV.
// The stored pointers point into ChA's own buffer.
// With SkipEmpty, an empty field is removed from WrdV as soon as the next
// delimiter is reached, and once more after the loop for the final field.
// Returns the number of entries in WrdV.
425  {
426  WrdV.Clr(false);
427  WrdV.Add(ChA.CStr()); // first field starts at the beginning of the buffer
428  for (char *c = (char *) ChA.CStr(); *c; c++) {
429  if (*c == Ch) {
430  *c = 0; // terminate the field that just ended
431  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } // drop the field just closed if it was empty
432  WrdV.Add(c+1); // next field starts right after the delimiter
433  }
434  }
435  if (SkipEmpty && ! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } // the last field has no closing delimiter, so check it here
436  return WrdV.Len();
437 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
char * CStr()
Definition: dt.h:255
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void DelLast()
Removes the last element of the vector.
Definition: ds.h:635
int TStrUtil::SplitSentences ( TChA ChA,
TVec< char * > &  SentenceV 
)
static

Definition at line 460 of file util.cpp.

// Splits ChA in place into sentences and appends a pointer to the start of each
// one to SentenceV. A sentence ends at '.', '!' or '?' when the following
// character is not alphanumeric; the terminator is overwritten with 0.
// The stored pointers point into ChA's own buffer.
// Returns the number of entries in SentenceV (0 if ChA holds only whitespace).
460  {
461  SentenceV.Clr();
462  const char *B = ChA.CStr();
463  const char *E = B+ChA.Len(); // one past the last character
464  char *c = (char *) B;
465  while (*c && TCh::IsWs(*c)) { c++; } // skip leading whitespace
466  if (*c) { SentenceV.Add(c); } else { return 0; }
// NOTE(review): the c<E test on the next line repeats the loop condition.
// *(c+1) reads *E when c is the last character -- presumably the terminating 0
// of the buffer; confirm that ChA.CStr() is always 0-terminated.
467  for (; c < E; c++) {
468  if (c<E && (*c == '.' || *c == '!' || *c == '?') && ! TCh::IsAlNum(*(c+1))) { // end of sentence
469  if (c<E && *(c+1)=='"') { *c='"'; c++; } // blah." --> blah"
470  if (c>=E) { continue; }
471  *c=0; c++; // terminate the sentence and step past the terminator
472  char *e = c-1; // e starts on the 0 that was just written
473  while (e>B && *e!='"' && ! TCh::IsAlNum(*e)) { *e=0; e--; } // skip trailing non-alpha-num chars
474  while (c<E && ! (TCh::IsAlNum(*c) || (*c=='"' && TCh::IsAlNum(*(c+1))))) { c++; } // sentence starts with AlNum or "AlNum
475  if (c<E) { SentenceV.Add(c); } // the loop's c++ then steps over this first character, which is AlNum or '"'
476  }
477  }
478  return SentenceV.Len();
479 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
int Len() const
Definition: dt.h:259
static bool IsWs(const char &Ch)
Definition: dt.h:970
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
char * CStr()
Definition: dt.h:255
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
int TStrUtil::SplitWords ( TChA ChA,
TVec< char * > &  WrdV,
const bool &  SplitOnWs = true 
)
static

Definition at line 412 of file util.cpp.

// Splits ChA in place into words and appends a pointer to the start of each
// one to WrdV. Each delimiter character is overwritten with 0.
// SplitOnWs == true : the only delimiter is the space character ' '.
// SplitOnWs == false: every character for which TCh::IsAlNum is false is a delimiter.
// The stored pointers point into ChA's own buffer.
// Returns the number of entries in WrdV.
412  {
413  WrdV.Clr(false);
414  WrdV.Add(ChA.CStr()); // first word starts at the beginning of the buffer
415  for (char *c = (char *) ChA.CStr(); *c; c++) {
416  if ((SplitOnWs && *c == ' ') || (! SplitOnWs && ! TCh::IsAlNum(*c))) {
417  *c = 0; // terminate the word that just ended
418  if (! WrdV.Empty() && strlen(WrdV.Last()) == 0) { WrdV.DelLast(); } // drop the word just closed if it was empty (e.g. consecutive delimiters)
419  WrdV.Add(c+1);
420  }
421  }
// NOTE(review): there is no empty-entry check after the loop, so input that ends
// with a delimiter (or is empty) leaves a trailing "" in WrdV, and it is counted
// in the return value. SplitOnCh does perform this check -- confirm whether the
// difference is intended.
422  return WrdV.Len();
423 }
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
char * CStr()
Definition: dt.h:255
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:551
static bool IsAlNum(const char &Ch)
Definition: dt.h:975
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void DelLast()
Removes the last element of the vector.
Definition: ds.h:635
bool TStrUtil::StripEnd ( const TChA Str,
const TChA SearchStr,
TChA NewStr 
)
static

Definition at line 331 of file util.cpp.

// If Str ends with SearchStr, stores Str without that suffix in NewStr and
// returns true. Otherwise returns false and does not assign to NewStr.
331  {
332  const int StrLen = Str.Len();
333  const int SearchStrLen = SearchStr.Len();
334  if (StrLen < SearchStrLen) { return false; } // suffix longer than the string: cannot match
335  for (int i = 0; i < SearchStrLen; i++) { // compare character by character, from the last one backwards
336  if (Str[StrLen-i-1] != SearchStr[SearchStrLen-i-1]) { return false; }
337  }
// NOTE(review): the end index passed to GetSubStr looks inclusive (last kept
// character is StrLen-SearchStrLen-1); it is -1 when Str equals SearchStr.
// Confirm against TChA::GetSubStr that this yields an empty string.
338  NewStr = Str.GetSubStr(0, StrLen-SearchStrLen-1);
339  return true;
340 }
int Len() const
Definition: dt.h:259
TChA GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:448

The documentation for this class was generated from the following files: