SNAP Library, Developer Reference  2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
html.h
Go to the documentation of this file.
00001 
00002 // Forward declarations of the token and document classes that are defined further down in this header (the ClassHdTP macro is defined elsewhere; presumably it also declares the smart-pointer type given as second argument)
00003 ClassHdTP(THtmlTok, PHtmlTok)
00004 ClassHdTP(THtmlDoc, PHtmlDoc)
00005 
00007 // Html-Lexical-Chars
      // Character classes used by the HTML lexer. One value is stored per
      // character in THtmlLxChDef::ChTyV and tested by its IsSpace/IsAlpha/
      // IsNum/IsSym helpers. Which characters map to hlctLTag/hlctRTag/hlctEof
      // is decided where the table is built (not visible in this header);
      // presumably '<', '>' and end-of-input.
00008 typedef enum {
00009   hlctSpace, hlctAlpha, hlctNum, hlctSym,
00010   hlctLTag, hlctRTag, hlctEof} THtmlLxChTy;
00011 
00012 ClassTP(THtmlLxChDef, PHtmlLxChDef)//{
      // Character-definition tables for the HTML lexer: a per-character class
      // table, upper/lower-case mapping tables and an escape-string map.
      // Every per-character vector is indexed by (Ch - TCh::Mn).
00013 private:
00014   TIntV ChTyV; // character class (a THtmlLxChTy value) for each character
00015   TChV UcChV; // upper-case mapping for each character
00016   TChV LcChV; // lower-case mapping for each character
00017   TStrStrH EscStrH; // string-to-string map; presumably filled by SetEscStr and read by GetEscStr (both defined outside this header)
      // Table-building helpers; their definitions are not in this header.
00018   void SetUcCh(const char& UcCh, const char& LcCh);
00019   void SetUcCh(const TStr& Str);
00020   void SetChTy(const THtmlLxChTy& ChTy, const TStr& Str);
00021   void SetEscStr(const TStr& SrcStr, const TStr& DstStr);
00022 public:
00023   THtmlLxChDef();
      // Deserializing constructor: reads the four tables in declaration order.
00024   THtmlLxChDef(TSIn& SIn): ChTyV(SIn), UcChV(SIn), LcChV(SIn), EscStrH(SIn){}
00025   static PHtmlLxChDef Load(TSIn& SIn){return new THtmlLxChDef(SIn);}
      // Writes the four tables in the same order the stream constructor reads them.
00026   void Save(TSOut& SOut){
00027     ChTyV.Save(SOut); UcChV.Save(SOut); LcChV.Save(SOut); EscStrH.Save(SOut);}
00028 
      // Assignment is not supported: the body invokes the Fail macro.
00029   THtmlLxChDef& operator=(const THtmlLxChDef&){Fail; return *this;}
00030 
00031   // character type operations
      // GetChTy returns the raw class value; the Is* predicates compare it
      // against a single THtmlLxChTy constant.
00032   int GetChTy(const char& Ch) const {return ChTyV[Ch-TCh::Mn];}
      // True for TCh::CrCh or TCh::LfCh (no table lookup).
00033   bool IsEoln(const char& Ch) const {return (Ch==TCh::CrCh)||(Ch==TCh::LfCh);}
      // True for ' ', TCh::TabCh, TCh::CrCh or TCh::LfCh (no table lookup).
00034   bool IsWs(const char& Ch) const {
00035     return (Ch==' ')||(Ch==TCh::TabCh)||(Ch==TCh::CrCh)||(Ch==TCh::LfCh);}
00036   bool IsSpace(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSpace;}
00037   bool IsAlpha(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctAlpha;}
00038   bool IsNum(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctNum;}
00039   bool IsAlNum(const char& Ch) const {
00040     return (int(ChTyV[Ch-TCh::Mn])==hlctAlpha)||(int(ChTyV[Ch-TCh::Mn])==hlctNum);}
00041   bool IsSym(const char& Ch) const {return int(ChTyV[Ch-TCh::Mn])==hlctSym;}
      // True for characters of class alpha or num, and for '.', '-', ':', '/', '~'.
00042   bool IsUrl(const char& Ch) const {
00043     int ChTy=ChTyV[Ch-TCh::Mn];
00044     return (ChTy==hlctAlpha)||(ChTy==hlctNum)||
00045      (Ch=='.')||(Ch=='-')||(Ch==':')||(Ch=='/')||(Ch=='~');}
00046 
00047   // upper/lower-case & escape-string operations
      // IsUc/IsLc: true when Ch is equal to its own upper-/lower-case mapping.
00048   bool IsUc(const char& Ch) const {return Ch==UcChV[Ch-TCh::Mn];}
00049   bool IsLc(const char& Ch) const {return Ch==LcChV[Ch-TCh::Mn];}
00050   char GetUc(const char& Ch) const {return UcChV[Ch-TCh::Mn];}
00051   char GetLc(const char& Ch) const {return LcChV[Ch-TCh::Mn];}
      // GetUcChA/GetLcChA convert ChA in place, one character at a time.
00052   void GetUcChA(TChA& ChA) const {
00053     for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetUc(ChA[ChN]));}}
00054   void GetLcChA(TChA& ChA) const {
00055     for (int ChN=0; ChN<ChA.Len(); ChN++){ChA.PutCh(ChN, GetLc(ChA[ChN]));}}
      // GetUcStr/GetLcStr return a converted copy; Str itself is not modified.
00056   TStr GetUcStr(const TStr& Str) const {
00057     TChA ChA(Str); GetUcChA(ChA); return ChA;}
00058   TStr GetLcStr(const TStr& Str) const {
00059     TChA ChA(Str); GetLcChA(ChA); return ChA;}
00060   TStr GetEscStr(const TStr& Str) const;
00061 
00062   // standard entry points
      // Shared instance; both accessors assert that ChDef is not empty.
00063   static PHtmlLxChDef ChDef;
00064   static PHtmlLxChDef GetChDef(){IAssert(!ChDef.Empty()); return ChDef;}
00065   static THtmlLxChDef& GetChDefRef(){IAssert(!ChDef.Empty()); return *ChDef;}
00066 
00067   // character-set transformations
      // Declared here, defined elsewhere; the exact mappings are not visible in this header.
00068   static TStr GetCSZFromYuascii(const TChA& ChA);
00069   static TStr GetCSZFromWin1250(const TChA& ChA);
00070   static TStr GetWin1250FromYuascii(const TChA& ChA);
00071   static TStr GetIsoCeFromYuascii(const TChA& ChA);
00072 };
00073 
00075 // Html-Lexical
      // Symbol kinds produced by the lexer (THtmlLx::Sym) and carried by
      // tokens (THtmlTok::GetSym). hsyUndef is the initial value set by the
      // THtmlLx and THtmlTok constructors. The remaining names presumably
      // stand for string, number, single symbol, URL, begin/end/meta tag and
      // end of input -- confirm against THtmlLx::GetSym, which is not visible here.
00076 typedef enum {
00077   hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl,
00078   hsyBTag, hsyETag, hsyMTag, hsyEof} THtmlLxSym;
00079 
00080 class THtmlLx{
      // HTML lexer over an input stream. Characters are read one at a time
      // with a push-back stack; the current symbol and its attributes are
      // exposed through the public data members below.
00081 private:
00082   static THtmlLxChDef ChDef; // character definitions shared by all lexer instances
00083   PSIn SIn; // input stream (smart pointer keeps it alive)
00084   TSIn& RSIn; // reference to *SIn, bound in the constructor
00085   bool DoParseArg; // set from the constructor; presumably controls tag-argument parsing (use not visible here)
00086   TChA ChStack; // push-back stack: filled by PutCh, drained by GetCh
00087   char Ch; // current character
00088   int ChX; // incremented for every character read by GetCh, decremented by PutCh
00089   bool EscCh; // used by out-of-line code; presumably flags that Ch came from an escape sequence
00090   TChA EscChA;
00091   TChA ArgNm;
00092   TChA ArgVal;
      // Advances to the next character: takes it from the push-back stack if
      // that is non-empty, otherwise from the input stream. At end of input
      // Ch becomes TCh::EofCh and ChX is left unchanged. The character
      // (including EofCh) is always appended to SymChA.
00093   void GetCh(){
00094     if (ChStack.Empty()){
00095       if (RSIn.Eof()){Ch=TCh::EofCh;} else {Ch=RSIn.GetCh(); ChX++;}
00096     } else {
00097       Ch=ChStack.Pop(); ChX++;
00098     }
00099     SymChA+=Ch;
00100   }
00101   void GetEscCh();
00102   void GetMetaTag();
00103   void GetTag();
00104 public:
00105   THtmlLxSym Sym; // kind of the current symbol
00106   int SymBChX, SymEChX; // presumably begin/end character positions of the current symbol (set out of line)
00107   TChA ChA;
00108   TChA UcChA;
00109   TChA SymChA; // characters accumulated by GetCh (PutCh removes the last one)
00110   int PreSpaces; // count used by GetPreSpaceStr
00111   TChA PreSpaceChA;
00112   typedef TStrKdV TArgNmValV;
00113   TArgNmValV ArgNmValV; // (name, value) pairs accessed by GetArgs/GetArgNm/GetArgVal/GetArg/PutArg
00114 public:
      // Binds the lexer to _SIn. Ch starts as ' ', ChX as 0 and Sym as
      // hsyUndef; SymChA is not listed and is therefore default-constructed.
00115   THtmlLx(const PSIn& _SIn, const bool& _DoParseArg=true):
00116     SIn(_SIn), RSIn(*SIn), DoParseArg(_DoParseArg),
00117     ChStack(), Ch(' '), ChX(0), EscCh(false),
00118     EscChA(), ArgNm(), ArgVal(),
00119     Sym(hsyUndef), SymBChX(0), SymEChX(0), ChA(), UcChA(),
00120     PreSpaces(0), PreSpaceChA(), ArgNmValV(){}
00121 
      // Assignment is not supported: the body invokes the Fail macro.
00122   THtmlLx& operator=(const THtmlLx&){Fail; return *this;}
00123 
      // Pushes the current character onto the push-back stack, drops the last
      // character of SymChA (if any), makes _Ch the current character and
      // decrements ChX.
00124   void PutCh(const char& _Ch){
00125     ChStack.Push(Ch); if (!SymChA.Empty()){SymChA.Pop();} Ch=_Ch; ChX--;}
      // Pushes back Str from its last character to its first, so that
      // Str[0] ends up as the current character.
00126   void PutStr(const TStr& Str){
00127     for (int ChN=Str.Len()-1; ChN>=0; ChN--){PutCh(Str[ChN]);}}
00128   THtmlLxSym GetSym();
00129   PHtmlTok GetTok(const bool& DoUc=true);
      // Returns TStr::GetSpaceStr(PreSpaces).
00130   TStr GetPreSpaceStr() const {
00131     return TStr::GetSpaceStr(PreSpaces);}
00132 
      // Argument accessors over ArgNmValV. In IsArg/GetArg/PutArg the
      // parameter ArgNm (and ArgVal in PutArg) shadows the private member of
      // the same name.
00133   int GetArgs() const {return ArgNmValV.Len();}
00134   TStr GetArgNm(const int& ArgN) const {return ArgNmValV[ArgN].Key;}
00135   TStr GetArgVal(const int& ArgN) const {return ArgNmValV[ArgN].Dat;}
00136   bool IsArg(const TStr& ArgNm) const {return ArgNmValV.IsIn(TStrKd(ArgNm));}
      // Returns the value stored for ArgNm, or DfArgVal when the search returns -1.
00137   TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal=TStr()) const {
00138     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00139     if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}}
      // Adds the pair when ArgNm is not found, otherwise overwrites the existing entry.
00140   void PutArg(const TStr& ArgNm, const TStr& ArgVal){
00141     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00142     if (ArgN==-1){ArgNmValV.Add(TStrKd(ArgNm, ArgVal));}
00143     else {ArgNmValV[ArgN]=TStrKd(ArgNm, ArgVal);}}
00144   TStr GetFullBTagStr() const;
00145 
      // Scanning helpers; declared here, defined elsewhere. Judging by the
      // names, the MoveTo* functions advance until the named string/tag or
      // end of input is reached -- confirm against the implementation.
00146   void MoveToStrOrEof(const TStr& Str);
00147   void MoveToBTagOrEof(const TStr& TagNm);
00148   void MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2);
00149   void MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3);
00150   void MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm);
00151   void MoveToBTagArgOrEof(
00152    const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal);
00153   void MoveToBTagArg2OrEof(const TStr& TagNm,
00154    const TStr& ArgNm1, const TStr& ArgVal1,
00155    const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP=true);
00156   void MoveToBTagOrEof(
00157    const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
00158    const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2);
00159   void MoveToETagOrEof(const TStr& TagNm);
00160   TStr GetTextOnlyStrToEof();
00161   TStr GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP=false);
00162   TStr GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
00163    const TStr& ArgVal, const bool& TxtOnlyP=false);
00164   TStr GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP=false);
00165   TStr GetStrToETag2(const TStr& TagNm1, const TStr& TagNm2, const bool& TxtOnlyP=false);
00166   TStr GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP=false);
00167   TStr GetHRefBeforeStr(const TStr& Str);
00168   bool IsGetBTag(const TStr& TagNm);
00169   bool IsGetETag(const TStr& TagNm);
00170 
      // Static utilities; declared here, defined elsewhere.
00171   static TStr GetSymStr(const THtmlLxSym& Sym);
00172   static TStr GetEscapedStr(const TChA& ChA);
00173   static TStr GetAsciiStr(const TChA& ChA, const char& GenericCh='_');
00174   static void GetTokStrV(const TStr& Str, TStrV& TokStrV);
00175   static TStr GetNoTag(const TStr& Str);
00176 };
00177 
00179 // Html-Token
      // A single lexer token: its symbol kind, its string and (for tags) the
      // (name, value) argument pairs.
00180 ClassTPV(THtmlTok, PHtmlTok, THtmlTokV)//{
00181 private:
00182   THtmlLxSym Sym; // symbol kind
00183   TStr Str; // token string
00184   THtmlLx::TArgNmValV ArgNmValV; // (name, value) argument pairs
00185 public:
00186   THtmlTok(): Sym(hsyUndef), Str(), ArgNmValV(){}
00187   THtmlTok(const THtmlLxSym& _Sym):
00188     Sym(_Sym), Str(), ArgNmValV(){}
00189   THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str):
00190     Sym(_Sym), Str(_Str), ArgNmValV(){}
00191   THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str,
00192    const THtmlLx::TArgNmValV& _ArgNmValV):
00193     Sym(_Sym), Str(_Str), ArgNmValV(_ArgNmValV){}
      // Stream load/save and assignment are not supported: each body invokes the Fail macro.
00194   THtmlTok(TSIn&){Fail;}
00195   static PHtmlTok Load(TSIn&){Fail; return NULL;}
00196   void Save(TSOut&){Fail;}
00197 
00198   THtmlTok& operator=(const THtmlTok&){Fail; return *this;}
00199 
00200   THtmlLxSym GetSym() const {return Sym;}
00201   TStr GetStr() const {return Str;}
00202   TStr GetFullStr() const;
      // True when a forward search for ArgNm does not return -1.
00203   bool IsArg(const TStr& ArgNm) const {
00204     return ArgNmValV.SearchForw(TStrKd(ArgNm))!=-1;}
      // Returns the value stored for ArgNm. The search result is used as an
      // index without a -1 check, so callers should test IsArg first or use
      // the overload that takes a default value.
00205   TStr GetArg(const TStr& ArgNm) const {
00206     return ArgNmValV[ArgNmValV.SearchForw(TStrKd(ArgNm))].Dat;}
      // Returns the value stored for ArgNm, or DfArgVal when the search returns -1.
00207   TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal) const {
00208     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00209     if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}}
00210   bool IsUrlTok(TStr& RelUrlStr) const;
00211   bool IsRedirUrlTok() const;
00212 
00213   void SaveTxt(const PSOut& SOut, const bool& TxtMode=true);
00214 
      // Tag-name constants; their values are defined outside this header.
00215   static const TStr ATagNm;
00216   static const TStr AreaTagNm;
00217   static const TStr BrTagNm;
00218   static const TStr CardTagNm;
00219   static const TStr CenterTagNm;
00220   static const TStr FrameTagNm;
00221   static const TStr H1TagNm;
00222   static const TStr H2TagNm;
00223   static const TStr H3TagNm;
00224   static const TStr H4TagNm;
00225   static const TStr H5TagNm;
00226   static const TStr H6TagNm;
00227   static const TStr ImgTagNm;
00228   static const TStr LiTagNm;
00229   static const TStr MetaTagNm;
00230   static const TStr PTagNm;
00231   static const TStr UlTagNm;
00232   static const TStr TitleTagNm;
00233   static const TStr TitleETagNm;
00234 
      // Argument-name constants; their values are defined outside this header.
00235   static const TStr AltArgNm;
00236   static const TStr HRefArgNm;
00237   static const TStr SrcArgNm;
00238   static const TStr TitleArgNm;
00239   static const TStr HttpEquivArgNm;
00240 
      // Tag classification helpers; declared here, defined elsewhere.
00241   static bool IsBreakTag(const TStr& TagNm);
00242   static bool IsBreakTok(const PHtmlTok& Tok);
00243   static bool IsHTag(const TStr& TagNm, int& HTagN);
00244   static PHtmlTok GetHTok(const bool& IsBTag, const int& HTagN);
00245 };
00246 
00248 // Html-Document
      // Selector passed to the THtmlDoc constructor/New/LoadTxt; hdtAll is the
      // default. Presumably it chooses which kinds of tokens are kept in the
      // document (all, strings, strings and numbers, tags, anchors, hrefs,
      // lists) -- the filtering code is not visible in this header.
00249 typedef enum {
00250   hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL} THtmlDocType;
00251 
00252 ClassTPV(THtmlDoc, PHtmlDoc, THtmlDocV)//{
      // An HTML document held as a vector of tokens, plus static helpers for
      // converting HTML text to other formats.
00253 private:
00254   THtmlTokV TokV; // the document's tokens
00255 public:
00256   THtmlDoc(): TokV(){}
      // Builds the document from the stream SIn (defined outside this header).
00257   THtmlDoc(
00258    const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true);
      // Factory wrapper around the stream constructor.
00259   static PHtmlDoc New(
00260    const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){
00261     return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
      // Binary load/save and assignment are not supported: each body invokes the Fail macro.
00262   THtmlDoc(TSIn&){Fail;}
00263   static PHtmlDoc Load(TSIn&){Fail; return NULL;}
00264   void Save(TSOut&){Fail;}
00265 
00266   THtmlDoc& operator=(const THtmlDoc&){Fail; return *this;}
00267 
00268   int GetToks() const {return TokV.Len();}
00269   PHtmlTok GetTok(const int& TokN) const {return TokV[TokN];}
      // Returns token TokN and also copies its symbol and string into Sym and Str.
00270   PHtmlTok GetTok(const int& TokN, THtmlLxSym& Sym, TStr& Str) const {
00271     Sym=TokV[TokN]->GetSym(); Str=TokV[TokN]->GetStr(); return TokV[TokN];}
      // Appends all tokens of _TokV to this document.
00272   void AddTokV(const THtmlTokV& _TokV){TokV.AddV(_TokV);}
00273 
00274   static TStr GetTxtLnDoc(const TStr& HtmlStr);
00275   static TStr GetTxtLnDoc(const TStr& HtmlStr, const TStr& BaseUrlStr,
00276    const bool& OutUrlP, const bool& OutTagsP);
00277 
      // Opens the file FNm with TFIn::New and builds a document from it.
00278   static PHtmlDoc LoadTxt(
00279    const TStr& FNm, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){
00280     PSIn SIn=TFIn::New(FNm); return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
00281   void SaveTxt(const PSOut& SOut, const bool& TxtMode=true) const;
00282 
      // HTML-to-text / HTML-to-XML converters; each exists in a stream and a
      // file-name variant. Declared here, defined elsewhere.
00283   static void SaveHtmlToTxt(
00284    const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
00285    const bool& OutUrlP, const bool& OutToksP);
00286   static void SaveHtmlToTxt(
00287    const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
00288    const bool& OutUrlP, const bool& OutToksP);
00289   static void SaveHtmlToXml(
00290    const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
00291    const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00292    const bool& OutTagsP, const bool& OutArgsP);
00293   static void SaveHtmlToXml(
00294    const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
00295    const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00296    const bool& OutTagsP, const bool& OutArgsP);
00297 
00298   static TLxSym GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA);
00299 
      // Redirect helpers; declared here, defined elsewhere.
00300   static bool _IsTagRedir(
00301    const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
00302    const TStr& BaseUrlStr, const TStr& RedirUrlStr);
00303   static TStr GetRedirHtmlDocStr(const TStr& HtmlStr,
00304    const TStr& BaseUrlStr, const TStr& RedirUrlStr);
00305 };
00306 
00308 // Html-Hyper-Link-Document-Vector
      // Holds a reference HTML document together with a vector of documents
      // derived from it by the constructor (defined outside this header;
      // presumably one document per hyper-link, built from a window of
      // HldWnLen tokens -- confirm against the implementation).
00309 ClassTP(THtmlHldV, PHtmlHldV)//{
00310 private:
00311   PHtmlDoc RefHtmlDoc; // the document passed to the constructor
00312   THtmlDocV HldV; // documents produced by the constructor
00313 public:
00314   THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen=10);
      // Binary load/save and assignment are not supported: each body invokes the Fail macro.
00315   THtmlHldV(TSIn&){Fail;}
00316   static PHtmlHldV Load(TSIn&){Fail; return NULL;}
00317   void Save(TSOut&){Fail;}
00318 
00319   THtmlHldV& operator=(const THtmlHldV&){Fail; return *this;}
00320 
      // Read-only accessors. They are const so they can be called through a
      // const reference, matching the accessors of THtmlDoc and TWebPg.
00321   PHtmlDoc GetRefHtmlDoc() const {return RefHtmlDoc;}
00322   int GetHlds() const {return HldV.Len();}
00323   PHtmlDoc GetHld(const int& HldN) const {return HldV[HldN];}
00324 };
00325 
00327 // Web-Page
00328 ClassTPV(TWebPg, PWebPg, TWebPgV)//{
00329 private:
00330   TStrV UrlStrV;
00331   TStrV IpNumV;
00332   PHttpResp HttpResp;
00333   uint64 FetchMSecs;
00334 public:
00335   TWebPg(): UrlStrV(), IpNumV(), HttpResp(){}
00336   TWebPg(const TStrV& _UrlStrV, const TStrV& _IpNumV, const PHttpResp& _HttpResp):
00337     UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp){}
00338   static PWebPg New(const TStrV& UrlStrV, const TStrV& IpNumV, const PHttpResp& HttpResp){
00339     return new TWebPg(UrlStrV, IpNumV, HttpResp);}
00340   static PWebPg New(const TStrV& UrlStrV, const PHttpResp& HttpResp){
00341     return new TWebPg(UrlStrV, TStrV(), HttpResp);}
00342   static PWebPg New(const TStr& UrlStr, const PHttpResp& HttpResp){
00343     TStrV UrlStrV; UrlStrV.Add(UrlStr);
00344     return new TWebPg(UrlStrV, TStrV(), HttpResp);}
00345   ~TWebPg(){}
00346   TWebPg(TSIn&){Fail;}
00347   static PWebPg Load(TSIn&){Fail; return NULL;}
00348   void Save(TSOut&){Fail;}
00349 
00350   TWebPg& operator=(const TWebPg&){Fail; return *this;}
00351 
00352   int GetUrls() const {return UrlStrV.Len();}
00353   TStr GetUrlStr(const int& UrlN=-1) const {
00354     if (UrlN==-1){return UrlStrV.Last();} else {return UrlStrV[UrlN];}}
00355   PUrl GetUrl(const int& UrlN=-1) const {
00356     TStr UrlStr;
00357     if (UrlN==-1){UrlStr=UrlStrV.Last();} else {UrlStr=UrlStrV[UrlN];}
00358     return TUrl::New(UrlStr);}
00359 
00360   int GetIps() const {return IpNumV.Len();}
00361   TStr GetIpNum(const int& IpN=-1) const {
00362     if (IpN==-1){return IpNumV.Last();} else {return IpNumV[IpN];}}
00363 
00364   PHttpResp GetHttpResp() const {return HttpResp;}
00365   TStr GetHttpHdStr() const {return GetHttpResp()->GetHdStr();}
00366   TStr GetHttpBodyAsStr() const {return GetHttpResp()->GetBodyAsStr();}
00367   //void GetOutUrlStrV(TStrV& OutUrlStrV) const;
00368   void GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const;
00369   void GetOutUrlV(TUrlV& OutUrlV) const {
00370     TUrlV OutRedirUrlV; GetOutUrlV(OutUrlV, OutRedirUrlV);}
00371   void GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const;
00372 
00373   // fetch time
00374   void PutFetchMSecs(const uint64& _FetchMSecs){FetchMSecs=_FetchMSecs;}
00375   uint64 GetFetchMSecs() const {return FetchMSecs;}
00376 
00377   void SaveAsHttpBody(const TStr& FNm) const;
00378   void SaveAsHttp(const TStr& FNm) const;
00379 
00380   bool IsTxt() const;
00381 };