SNAP Library 2.0, Developer Reference  2013-05-13 16:33:57
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
html.h
Go to the documentation of this file.
00001 #include "bd.h"
00002 
00004 // Forward
00005 ClassHdTP(THtmlTok, PHtmlTok)  // needed early: THtmlLx::GetTok returns PHtmlTok before THtmlTok is defined below
00006 ClassHdTP(THtmlDoc, PHtmlDoc)  // THtmlDoc itself is defined further down; ClassHdTP is not defined in this listing (presumably bd.h -- TODO confirm)
00007 
00009 // Html-Lexical-Chars
00010 typedef enum {
      // Character classes. THtmlLxChDef keeps one of these per character in
      // ChTyV, and its IsSpace/IsAlpha/IsNum/IsSym predicates compare against
      // the first four values. hlctLTag, hlctRTag and hlctEof are not used by
      // the inline code in this header; by their names they presumably mark
      // the tag-opening character, the tag-closing character and end of
      // input -- TODO confirm against the lexer implementation.
00011   hlctSpace, hlctAlpha, hlctNum, hlctSym,
00012   hlctLTag, hlctRTag, hlctEof} THtmlLxChTy;
00013 
00014 ClassTP(THtmlLxChDef, PHtmlLxChDef)//{
      // Character-definition tables for the HTML lexer: a character-class
      // table, upper-/lower-case maps and an escape-string table. Every
      // per-character lookup below indexes its table with Ch-TCh::Mn.
00015 private:
00016   TIntV ChTyV;       // character class (a THtmlLxChTy value) per character
00017   TChV UcChV;        // upper-case counterpart per character
00018   TChV LcChV;        // lower-case counterpart per character
00019   TStrStrH EscStrH;  // string-to-string table; filled/used by SetEscStr/GetEscStr (defined elsewhere)
      // table-building helpers; bodies are not part of this header
00020   void SetUcCh(const char& UcCh, const char& LcCh);
00021   void SetUcCh(const TStr& Str);
00022   void SetChTy(const THtmlLxChTy& ChTy, const TStr& Str);
00023   void SetEscStr(const TStr& SrcStr, const TStr& DstStr);
00024 public:
00025   THtmlLxChDef();
      // stream constructor: reads the four tables in the order Save() writes them
00026   THtmlLxChDef(TSIn& SIn): ChTyV(SIn), UcChV(SIn), LcChV(SIn), EscStrH(SIn){}
00027   static PHtmlLxChDef Load(TSIn& SIn){return PHtmlLxChDef(new THtmlLxChDef(SIn));}
00028   void Save(TSOut& SOut){
00029     ChTyV.Save(SOut); UcChV.Save(SOut); LcChV.Save(SOut); EscStrH.Save(SOut);}
00030 
      // assignment is not supported: the body invokes Fail
00031   THtmlLxChDef& operator=(const THtmlLxChDef&){Fail; return *this;}
00032 
00033   // character type operations
      // GetChTy is the single table lookup; the class predicates build on it
00034   int GetChTy(const char& Ch) const {return ChTyV[Ch-TCh::Mn];}
00035   bool IsEoln(const char& Ch) const {return (Ch==TCh::CrCh)||(Ch==TCh::LfCh);}
00036   bool IsWs(const char& Ch) const {
00037     return (Ch==' ')||(Ch==TCh::TabCh)||IsEoln(Ch);}
00038   bool IsSpace(const char& Ch) const {return GetChTy(Ch)==hlctSpace;}
00039   bool IsAlpha(const char& Ch) const {return GetChTy(Ch)==hlctAlpha;}
00040   bool IsNum(const char& Ch) const {return GetChTy(Ch)==hlctNum;}
00041   bool IsAlNum(const char& Ch) const {
00042     return IsAlpha(Ch)||IsNum(Ch);}
00043   bool IsSym(const char& Ch) const {return GetChTy(Ch)==hlctSym;}
      // true for alphanumerics and for the characters . - : / ~
00044   bool IsUrl(const char& Ch) const {
00045     if (IsAlNum(Ch)){return true;}
00046     switch (Ch){
00047      case '.': case '-': case ':': case '/': case '~': return true; default: return false;}}
00048 
00049   // upper/lower-case & escape-string operations
      // a character counts as upper (lower) case when it maps to itself
00050   bool IsUc(const char& Ch) const {return GetUc(Ch)==Ch;}
00051   bool IsLc(const char& Ch) const {return GetLc(Ch)==Ch;}
00052   char GetUc(const char& Ch) const {return UcChV[Ch-TCh::Mn];}
00053   char GetLc(const char& Ch) const {return LcChV[Ch-TCh::Mn];}
      // in-place case conversion of every character of ChA
00054   void GetUcChA(TChA& ChA) const {
00055     for (int Pos=0; Pos<ChA.Len(); Pos++){const char Uc=GetUc(ChA[Pos]); ChA.PutCh(Pos, Uc);}}
00056   void GetLcChA(TChA& ChA) const {
00057     for (int Pos=0; Pos<ChA.Len(); Pos++){const char Lc=GetLc(ChA[Pos]); ChA.PutCh(Pos, Lc);}}
      // string variants: convert a TChA copy of Str and return it
00058   TStr GetUcStr(const TStr& Str) const {
00059     TChA UcBuf(Str); GetUcChA(UcBuf); return UcBuf;}
00060   TStr GetLcStr(const TStr& Str) const {
00061     TChA LcBuf(Str); GetLcChA(LcBuf); return LcBuf;}
00062   TStr GetEscStr(const TStr& Str) const;
00063 
00064   // standard entry points
      // shared instance; both accessors assert that ChDef is non-empty
00065   static PHtmlLxChDef ChDef;
00066   static PHtmlLxChDef GetChDef(){IAssert(!ChDef.Empty()); return ChDef;}
00067   static THtmlLxChDef& GetChDefRef(){IAssert(!ChDef.Empty()); return *ChDef;}
00068 
00069   // character-set transformations
      // definitions are outside this header
00070   static TStr GetCSZFromYuascii(const TChA& ChA);
00071   static TStr GetCSZFromWin1250(const TChA& ChA);
00072   static TStr GetWin1250FromYuascii(const TChA& ChA);
00073   static TStr GetIsoCeFromYuascii(const TChA& ChA);
00074 };
00075 
00077 // Html-Lexical
00078 typedef enum {
      // Lexical symbol kinds reported in THtmlLx::Sym and stored in THtmlTok.
      // hsyUndef is the value both classes start with in their constructors.
      // The remaining names presumably stand for string, number, single
      // symbol, URL, begin-tag, end-tag, meta-tag and end-of-input -- the
      // code that assigns them is not in this header; TODO confirm.
00079   hsyUndef, hsyStr, hsyNum, hsySSym, hsyUrl,
00080   hsyBTag, hsyETag, hsyMTag, hsyEof} THtmlLxSym;
00081 
00082 class THtmlLx{
      // HTML lexer over an input stream. It reads one character at a time
      // (GetCh) with push-back support (PutCh/PutStr) and exposes the current
      // symbol and its tag arguments through public fields and accessors.
      // GetSym/GetTok and all Move*/GetStr* members are defined outside this
      // header.
00083 private:
00084   static THtmlLxChDef ChDef;  // character definitions shared by all lexers
00085   PSIn SIn;                   // input stream handle passed to the constructor
00086   TSIn& RSIn;                 // reference bound to *SIn in the constructor; must stay declared after SIn
00087   bool DoParseArg;            // constructor flag; presumably enables tag-argument parsing -- TODO confirm in GetTag
00088   TChA ChStack;               // push-back stack: GetCh pops from it before touching RSIn
00089   char Ch;                    // current character (starts as ' ')
00090   int ChX;                    // character counter: +1 per character consumed, -1 per PutCh
00091   bool EscCh;
00092   TChA EscChA;
00093   TChA ArgNm;
00094   TChA ArgVal;
      // Fetches the next character into Ch. Pushed-back characters are
      // consumed first. At end of input Ch becomes TCh::EofCh and ChX is left
      // unchanged. In every case the resulting Ch is appended to SymChA.
00095   void GetCh(){
00096     if (ChStack.Empty()){
00097       if (RSIn.Eof()){Ch=TCh::EofCh;} else {Ch=RSIn.GetCh(); ChX++;}
00098     } else {
00099       Ch=ChStack.Pop(); ChX++;
00100     }
00101     SymChA+=Ch;
00102   }
00103   void GetEscCh();
00104   void GetMetaTag();
00105   void GetTag();
00106 public:
      // State of the most recently read symbol. Only SymChA is maintained by
      // the inline code here (GetCh appends, PutCh removes the last char);
      // the other fields are filled by code outside this header.
00107   THtmlLxSym Sym;
00108   int SymBChX, SymEChX;
00109   TChA ChA;
00110   TChA UcChA;
00111   TChA SymChA;
00112   int PreSpaces;
00113   TChA PreSpaceChA;
00114   typedef TStrKdV TArgNmValV;  // vector of (argument name, argument value) pairs
00115   TArgNmValV ArgNmValV;        // arguments accessed by GetArgs/GetArgNm/GetArgVal/IsArg/GetArg/PutArg
00116 public:
      // _SIn must refer to a stream: it is dereferenced to initialize RSIn.
      // SymChA is not listed in the initializer list and is therefore
      // default-constructed.
00117   THtmlLx(const PSIn& _SIn, const bool& _DoParseArg=true):
00118     SIn(_SIn), RSIn(*SIn), DoParseArg(_DoParseArg),
00119     ChStack(), Ch(' '), ChX(0), EscCh(false),
00120     EscChA(), ArgNm(), ArgVal(),
00121     Sym(hsyUndef), SymBChX(0), SymEChX(0), ChA(), UcChA(),
00122     PreSpaces(0), PreSpaceChA(), ArgNmValV(){}
00123 
      // assignment is not supported: the body invokes Fail
00124   THtmlLx& operator=(const THtmlLx&){Fail; return *this;}
00125 
      // Pushes a character back: the current Ch is saved on ChStack, the last
      // character of SymChA (if any) is dropped, _Ch becomes the current
      // character and ChX is decremented. The statement order matters because
      // the old Ch must be saved before it is overwritten.
00126   void PutCh(const char& _Ch){
00127     ChStack.Push(Ch); if (!SymChA.Empty()){SymChA.Pop();} Ch=_Ch; ChX--;}
      // Pushes back a whole string by calling PutCh from the last character
      // to the first, so Str[0] ends up as the current character.
00128   void PutStr(const TStr& Str){
00129     for (int ChN=Str.Len()-1; ChN>=0; ChN--){PutCh(Str[ChN]);}}
00130   THtmlLxSym GetSym();
00131   PHtmlTok GetTok(const bool& DoUc=true);
      // returns TStr::GetSpaceStr(PreSpaces); PreSpaceChA is not consulted
00132   TStr GetPreSpaceStr() const {
00133     return TStr::GetSpaceStr(PreSpaces);}
00134 
      // Argument accessors over ArgNmValV. Parameters named ArgNm/ArgVal
      // shadow the private members of the same name inside these bodies.
00135   int GetArgs() const {return ArgNmValV.Len();}
00136   TStr GetArgNm(const int& ArgN) const {return ArgNmValV[ArgN].Key;}
00137   TStr GetArgVal(const int& ArgN) const {return ArgNmValV[ArgN].Dat;}
00138   bool IsArg(const TStr& ArgNm) const {return ArgNmValV.IsIn(TStrKd(ArgNm));}
      // returns DfArgVal when SearchForw reports -1 (name not found)
00139   TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal=TStr()) const {
00140     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00141     if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}}
      // adds the pair when the name is absent, otherwise overwrites the entry in place
00142   void PutArg(const TStr& ArgNm, const TStr& ArgVal){
00143     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00144     if (ArgN==-1){ArgNmValV.Add(TStrKd(ArgNm, ArgVal));}
00145     else {ArgNmValV[ArgN]=TStrKd(ArgNm, ArgVal);}}
00146   TStr GetFullBTagStr() const;
00147 
      // Scanning helpers; by their names they advance until a given string or
      // tag is met or input ends. Their definitions are outside this header,
      // so exact stop conditions should be checked there.
00148   void MoveToStrOrEof(const TStr& Str);
00149   void MoveToBTagOrEof(const TStr& TagNm);
00150   void MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2);
00151   void MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3);
00152   void MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm);
00153   void MoveToBTagArgOrEof(
00154    const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal);
00155   void MoveToBTagArg2OrEof(const TStr& TagNm,
00156    const TStr& ArgNm1, const TStr& ArgVal1,
00157    const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP=true);
00158   void MoveToBTagOrEof(
00159    const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
00160    const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2);
00161   void MoveToETagOrEof(const TStr& TagNm);
00162   TStr GetTextOnlyStrToEof();
00163   TStr GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP=false);
00164   TStr GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
00165    const TStr& ArgVal, const bool& TxtOnlyP=false);
00166   TStr GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP=false);
00167   TStr GetStrToETag2(const TStr& TagNm1, const TStr& TagNm2, const bool& TxtOnlyP=false);
00168   TStr GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP=false);
00169   TStr GetHRefBeforeStr(const TStr& Str);
00170   bool IsGetBTag(const TStr& TagNm);
00171   bool IsGetETag(const TStr& TagNm);
00172 
      // stateless string utilities; definitions are outside this header
00173   static TStr GetSymStr(const THtmlLxSym& Sym);
00174   static TStr GetEscapedStr(const TChA& ChA);
00175   static TStr GetAsciiStr(const TChA& ChA, const char& GenericCh='_');
00176   static void GetTokStrV(const TStr& Str, TStrV& TokStrV);
00177   static TStr GetNoTag(const TStr& Str);
00178 };
00179 
00181 // Html-Token
00182 ClassTPV(THtmlTok, PHtmlTok, THtmlTokV)//{
00183 private:
00184   THtmlLxSym Sym;
00185   TStr Str;
00186   THtmlLx::TArgNmValV ArgNmValV;
00187 public:
00188   THtmlTok(): Sym(hsyUndef), Str(), ArgNmValV(){}
00189   THtmlTok(const THtmlLxSym& _Sym):
00190     Sym(_Sym), Str(), ArgNmValV(){}
00191   THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str):
00192     Sym(_Sym), Str(_Str), ArgNmValV(){}
00193   THtmlTok(const THtmlLxSym& _Sym, const TStr& _Str,
00194    const THtmlLx::TArgNmValV& _ArgNmValV):
00195     Sym(_Sym), Str(_Str), ArgNmValV(_ArgNmValV){}
00196   THtmlTok(TSIn&){Fail;}
00197   static PHtmlTok Load(TSIn&){Fail; return NULL;}
00198   void Save(TSOut&){Fail;}
00199 
00200   THtmlTok& operator=(const THtmlTok&){Fail; return *this;}
00201 
00202   THtmlLxSym GetSym() const {return Sym;}
00203   TStr GetStr() const {return Str;}
00204   TStr GetFullStr() const;
00205   bool IsArg(const TStr& ArgNm) const {
00206     return ArgNmValV.SearchForw(TStrKd(ArgNm))!=-1;}
00207   TStr GetArg(const TStr& ArgNm) const {
00208     return ArgNmValV[ArgNmValV.SearchForw(TStrKd(ArgNm))].Dat;}
00209   TStr GetArg(const TStr& ArgNm, const TStr& DfArgVal) const {
00210     int ArgN=ArgNmValV.SearchForw(TStrKd(ArgNm));
00211     if (ArgN==-1){return DfArgVal;} else {return ArgNmValV[ArgN].Dat;}}
00212   bool IsUrlTok(TStr& RelUrlStr) const;
00213   bool IsRedirUrlTok() const;
00214 
00215   void SaveTxt(const PSOut& SOut, const bool& TxtMode=true);
00216 
00217   static const TStr ATagNm;
00218   static const TStr AreaTagNm;
00219   static const TStr BrTagNm;
00220   static const TStr CardTagNm;
00221   static const TStr CenterTagNm;
00222   static const TStr FrameTagNm;
00223   static const TStr H1TagNm;
00224   static const TStr H2TagNm;
00225   static const TStr H3TagNm;
00226   static const TStr H4TagNm;
00227   static const TStr H5TagNm;
00228   static const TStr H6TagNm;
00229   static const TStr ImgTagNm;
00230   static const TStr LiTagNm;
00231   static const TStr MetaTagNm;
00232   static const TStr PTagNm;
00233   static const TStr UlTagNm;
00234   static const TStr TitleTagNm;
00235   static const TStr TitleETagNm;
00236 
00237   static const TStr AltArgNm;
00238   static const TStr HRefArgNm;
00239   static const TStr SrcArgNm;
00240   static const TStr TitleArgNm;
00241   static const TStr HttpEquivArgNm;
00242 
00243   static bool IsBreakTag(const TStr& TagNm);
00244   static bool IsBreakTok(const PHtmlTok& Tok);
00245   static bool IsHTag(const TStr& TagNm, int& HTagN);
00246   static PHtmlTok GetHTok(const bool& IsBTag, const int& HTagN);
00247 };
00248 
00250 // Html-Document
00251 typedef enum {
      // Value of the Type parameter accepted by the THtmlDoc stream
      // constructor, THtmlDoc::New and THtmlDoc::LoadTxt; hdtAll is their
      // default. The names suggest a filter on which tokens are kept (all,
      // strings, strings and numbers, tags, anchors, hrefs, lists) -- the
      // code that interprets them is not in this header; TODO confirm.
00252   hdtAll, hdtStr, hdtStrNum, hdtTag, hdtA, hdtHRef, hdtUL} THtmlDocType;
00253 
00254 ClassTPV(THtmlDoc, PHtmlDoc, THtmlDocV)//{
      // An HTML document held as a vector of tokens, plus static helpers
      // that convert HTML text to plain text or XML. Members declared
      // without a body are defined outside this header.
00255 private:
00256   THtmlTokV TokV;  // the document's tokens, in the order they were added
00257 public:
00258   THtmlDoc(): TokV(){}
      // builds a document from the stream SIn; Type and DoUc are interpreted
      // by the out-of-line definition
00259   THtmlDoc(
00260    const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true);
      // heap-allocating counterpart of the constructor above
00261   static PHtmlDoc New(
00262    const PSIn& SIn, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){
00263     return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
      // stream serialization is not supported: these three invoke Fail
00264   THtmlDoc(TSIn&){Fail;}
00265   static PHtmlDoc Load(TSIn&){Fail; return NULL;}
00266   void Save(TSOut&){Fail;}
00267 
      // assignment is not supported: the body invokes Fail
00268   THtmlDoc& operator=(const THtmlDoc&){Fail; return *this;}
00269 
00270   int GetToks() const {return TokV.Len();}
00271   PHtmlTok GetTok(const int& TokN) const {return TokV[TokN];}
      // returns token TokN and also copies its symbol and text into Sym and Str
00272   PHtmlTok GetTok(const int& TokN, THtmlLxSym& Sym, TStr& Str) const {
00273     Sym=TokV[TokN]->GetSym(); Str=TokV[TokN]->GetStr(); return TokV[TokN];}
      // appends all tokens of _TokV to this document
00274   void AddTokV(const THtmlTokV& _TokV){TokV.AddV(_TokV);}
00275 
00276   static TStr GetTxtLnDoc(const TStr& HtmlStr);
00277   static TStr GetTxtLnDoc(const TStr& HtmlStr, const TStr& BaseUrlStr,
00278    const bool& OutUrlP, const bool& OutTagsP);
00279 
      // opens file FNm through TFIn::New and builds a document from it
00280   static PHtmlDoc LoadTxt(
00281    const TStr& FNm, const THtmlDocType& Type=hdtAll, const bool& DoUc=true){
00282     PSIn SIn=TFIn::New(FNm); return PHtmlDoc(new THtmlDoc(SIn, Type, DoUc));}
00283   void SaveTxt(const PSOut& SOut, const bool& TxtMode=true) const;
00284 
      // Conversion helpers. Each comes in two overloads: one writing to an
      // output stream (PSOut) and one taking a file name.
00285   static void SaveHtmlToTxt(
00286    const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
00287    const bool& OutUrlP, const bool& OutToksP);
00288   static void SaveHtmlToTxt(
00289    const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
00290    const bool& OutUrlP, const bool& OutToksP);
00291   static void SaveHtmlToXml(
00292    const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
00293    const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00294    const bool& OutTagsP, const bool& OutArgsP);
00295   static void SaveHtmlToXml(
00296    const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
00297    const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00298    const bool& OutTagsP, const bool& OutArgsP);
00299 
00300   static TLxSym GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA);
00301 
      // redirect handling; _IsTagRedir is public but, judging by the leading
      // underscore, presumably meant as a helper of GetRedirHtmlDocStr -- TODO confirm
00302   static bool _IsTagRedir(
00303    const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
00304    const TStr& BaseUrlStr, const TStr& RedirUrlStr);
00305   static TStr GetRedirHtmlDocStr(const TStr& HtmlStr,
00306    const TStr& BaseUrlStr, const TStr& RedirUrlStr);
00307 };
00308 
00310 // Html-Hyper-Link-Document-Vector
00311 ClassTP(THtmlHldV, PHtmlHldV)//{
      // A reference HTML document together with a vector of documents derived
      // from it (the class comment calls them hyper-link documents). The
      // vector is built by the out-of-line constructor.
00312 private:
00313   PHtmlDoc RefHtmlDoc;  // document passed to the constructor
00314   THtmlDocV HldV;       // derived documents, one per entry
00315 public:
      // HldWnLen defaults to 10; its meaning is fixed by the out-of-line
      // definition (presumably a window length in tokens -- TODO confirm)
00316   THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen=10);
      // stream serialization is not supported: these three invoke Fail
00317   THtmlHldV(TSIn&){Fail;}
00318   static PHtmlHldV Load(TSIn&){Fail; return NULL;}
00319   void Save(TSOut&){Fail;}
00320 
      // assignment is not supported: the body invokes Fail
00321   THtmlHldV& operator=(const THtmlHldV&){Fail; return *this;}
00322 
      // Read-only accessors. They are const-qualified so they can be called
      // on const objects, as the equivalent getters of THtmlDoc and TWebPg can.
00323   PHtmlDoc GetRefHtmlDoc() const {return RefHtmlDoc;}
00324   int GetHlds() const {return HldV.Len();}
00325   PHtmlDoc GetHld(const int& HldN) const {return HldV[HldN];}
00326 };
00327 
00329 // Web-Page
00330 ClassTPV(TWebPg, PWebPg, TWebPgV)//{
00331 private:
00332   TStrV UrlStrV;
00333   TStrV IpNumV;
00334   PHttpResp HttpResp;
00335   uint64 FetchMSecs;
00336 public:
00337   TWebPg(): UrlStrV(), IpNumV(), HttpResp(){}
00338   TWebPg(const TStrV& _UrlStrV, const TStrV& _IpNumV, const PHttpResp& _HttpResp):
00339     UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp){}
00340   static PWebPg New(const TStrV& UrlStrV, const TStrV& IpNumV, const PHttpResp& HttpResp){
00341     return new TWebPg(UrlStrV, IpNumV, HttpResp);}
00342   static PWebPg New(const TStrV& UrlStrV, const PHttpResp& HttpResp){
00343     return new TWebPg(UrlStrV, TStrV(), HttpResp);}
00344   static PWebPg New(const TStr& UrlStr, const PHttpResp& HttpResp){
00345     TStrV UrlStrV; UrlStrV.Add(UrlStr);
00346     return new TWebPg(UrlStrV, TStrV(), HttpResp);}
00347   ~TWebPg(){}
00348   TWebPg(TSIn&){Fail;}
00349   static PWebPg Load(TSIn&){Fail; return NULL;}
00350   void Save(TSOut&){Fail;}
00351 
00352   TWebPg& operator=(const TWebPg&){Fail; return *this;}
00353 
00354   int GetUrls() const {return UrlStrV.Len();}
00355   TStr GetUrlStr(const int& UrlN=-1) const {
00356     if (UrlN==-1){return UrlStrV.Last();} else {return UrlStrV[UrlN];}}
00357   PUrl GetUrl(const int& UrlN=-1) const {
00358     TStr UrlStr;
00359     if (UrlN==-1){UrlStr=UrlStrV.Last();} else {UrlStr=UrlStrV[UrlN];}
00360     return TUrl::New(UrlStr);}
00361 
00362   int GetIps() const {return IpNumV.Len();}
00363   TStr GetIpNum(const int& IpN=-1) const {
00364     if (IpN==-1){return IpNumV.Last();} else {return IpNumV[IpN];}}
00365 
00366   PHttpResp GetHttpResp() const {return HttpResp;}
00367   TStr GetHttpHdStr() const {return GetHttpResp()->GetHdStr();}
00368   TStr GetHttpBodyAsStr() const {return GetHttpResp()->GetBodyAsStr();}
00369   //void GetOutUrlStrV(TStrV& OutUrlStrV) const;
00370   void GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const;
00371   void GetOutUrlV(TUrlV& OutUrlV) const {
00372     TUrlV OutRedirUrlV; GetOutUrlV(OutUrlV, OutRedirUrlV);}
00373   void GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const;
00374 
00375   // fetch time
00376   void PutFetchMSecs(const uint64& _FetchMSecs){FetchMSecs=_FetchMSecs;}
00377   uint64 GetFetchMSecs() const {return FetchMSecs;}
00378 
00379   void SaveAsHttpBody(const TStr& FNm) const;
00380   void SaveAsHttp(const TStr& FNm) const;
00381 
00382   bool IsTxt() const;
00383 };