SNAP Library, User Reference  2012-10-15 15:06:59
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
html.cpp
Go to the documentation of this file.
00001 
00002 // Html-Lexical-Chars
00003 void THtmlLxChDef::SetUcCh(const char& UcCh, const char& LcCh){
00004   // update upper-case (more lower cases may have one upper case)
00005   IAssert(
00006    (UcChV[LcCh-TCh::Mn]==TCh(0))||
00007    (UcChV[LcCh-TCh::Mn]==TCh(LcCh)));
00008   UcChV[LcCh-TCh::Mn]=TCh(UcCh);
00009   // update lower-case (one upper case may have only one lower case)
00010   if ((LcChV[UcCh-TCh::Mn]==TCh(0))||(LcChV[UcCh-TCh::Mn]==TCh(UcCh))){
00011     LcChV[UcCh-TCh::Mn]=TCh(LcCh);
00012   }
00013 }
00014 
00015 void THtmlLxChDef::SetUcCh(const TStr& Str){
00016   // set type of characters as letters
00017   SetChTy(hlctAlpha, Str);
00018   // first char in string is upper-case, rest are lower-case
00019   for (int ChN=1; ChN<Str.Len(); ChN++){
00020     SetUcCh(Str[0], Str[ChN]);
00021   }
00022 }
00023 
00024 void THtmlLxChDef::SetChTy(const THtmlLxChTy& ChTy, const TStr& Str){
00025   for (int ChN=0; ChN<Str.Len(); ChN++){
00026     ChTyV[Str[ChN]-TCh::Mn]=TInt(ChTy);}
00027 }
00028 
00029 void THtmlLxChDef::SetEscStr(const TStr& SrcStr, const TStr& DstStr){
00030   EscStrH.AddDat(SrcStr, DstStr);
00031 }
00032 
00033 TStr THtmlLxChDef::GetEscStr(const TStr& Str) const {
00034   int EscStrId;
00035   if ((EscStrId=EscStrH.GetKeyId(Str))!=-1){
00036     return EscStrH[EscStrId];
00037   } else
00038   if ((Str.Len()>=2)&&(Str[0]=='&')&&(Str[1]=='#')){
00039     int ChCd=0;
00040     for (int ChN=2; ChN<Str.Len(); ChN++){
00041       if (ChCd<=0xFFFF){ChCd=ChCd*10+Str[ChN]-'0';}}
00042     return TStr((char)ChCd);
00043   } else {
00044     return TStr(' ');
00045   }
00046 }
00047 
00048 THtmlLxChDef::THtmlLxChDef():
00049   ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
00050 
00051   // Character-Types
00052   ChTyV.PutAll(TInt(hlctSpace));
00053   SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
00054   SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
00055   SetChTy(hlctAlpha, "@_");
00056   SetChTy(hlctNum, "0123456789");
00057   SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
00058   SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">");
00059   SetChTy(hlctEof, TStr(TCh::EofCh));
00060   for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
00061     if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
00062   //SetChTy(hlctSpace, TStr(TCh(char(160))));
00063 
00064   // Upper-Case
00065   {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
00066     SetUcCh(char(Ch), char(Ch));}}
00067   SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
00068   SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
00069   SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
00070   SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
00071   SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
00072   SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
00073   SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
00074   SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
00075   SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
00076   SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
00077   SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
00078   SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
00079   SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
00080   SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
00081   SetUcCh("\x8e\x9e");
00082   // ISO-CE
00083   //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
00084   //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
00085   //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
00086   //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
00087   //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/
00088 
00089   // Annoying Unicode-characters
00090   //SetChTy(hlctSpace, "\xc2\xef");
00091 
00092   // Escape-Sequences
00093   SetEscStr("&quot", "\""); SetEscStr("&amp", "&");
00094   SetEscStr("&lt", "<"); SetEscStr("&gt", ">");
00095   SetEscStr("&nbsp", " ");
00096 
00097   SetEscStr("&auml", "\xe4"); SetEscStr("&Auml", "\xc4");
00098   SetEscStr("&ouml", "\xf6"); SetEscStr("&Ouml", "\xd6");
00099   SetEscStr("&uuml", "\xfc"); SetEscStr("&Uuml", "\xdc");
00100   SetEscStr("&aring", "\xe5"); SetEscStr("&Aring", "\xc5");
00101   SetEscStr("&oslash", "\xf8"); SetEscStr("&Oslash", "\xd8");
00102   SetEscStr("&Aelig", "\xc6"); SetEscStr("&aelig", "\xe6");
00103 
00104   SetEscStr("&eacute", "e"); SetEscStr("&Eacute", "E");
00105   SetEscStr("&egrave", "e"); SetEscStr("&Egrave", "E");
00106   SetEscStr("&agrave", "a"); SetEscStr("&Agrave", "A");
00107 }
00108 
00109 PHtmlLxChDef THtmlLxChDef::ChDef=PHtmlLxChDef(new THtmlLxChDef());
00110 
00111 TStr THtmlLxChDef::GetCSZFromYuascii(const TChA& ChA){
00112   TChA DstChA;
00113   for (int ChN=0; ChN<ChA.Len(); ChN++){
00114     char Ch=ChA[ChN];
00115     switch (Ch){
00116       case '~': DstChA+='c'; break;
00117       case '^': DstChA+='C'; break;
00118       case '}': DstChA+='c'; break;
00119       case ']': DstChA+='C'; break;
00120       case '|': DstChA+='d'; break;
00121       case '\\': DstChA+='D'; break;
00122       case '{': DstChA+='s'; break;
00123       case '[': DstChA+='S'; break;
00124       case '`': DstChA+='z'; break;
00125       case '@': DstChA+='Z'; break;
00126       default: DstChA+=Ch;
00127     }
00128   }
00129   return DstChA;
00130 }
00131 
00132 TStr THtmlLxChDef::GetCSZFromWin1250(const TChA& ChA){
00133   TChA DstChA;
00134   for (int ChN=0; ChN<ChA.Len(); ChN++){
00135     const uchar Ch=ChA[ChN];
00136     switch (Ch){
00137       case 232: DstChA+='c'; break;
00138       case 200: DstChA+='C'; break;
00139       case 154: DstChA+='s'; break;
00140       case 138: DstChA+='S'; break;
00141       case 158: DstChA+='z'; break;
00142       case 142: DstChA+='Z'; break;
00143       default: DstChA+=Ch;
00144     }
00145   }
00146   return DstChA;
00147 }
00148 
00149 TStr THtmlLxChDef::GetWin1250FromYuascii(const TChA& ChA){
00150   TChA DstChA;
00151   for (int ChN=0; ChN<ChA.Len(); ChN++){
00152     char Ch=ChA[ChN];
00153     switch (Ch){
00154       case '~': DstChA+=uchar(232); break;
00155       case '^': DstChA+=uchar(200); break;
00156       case '}': DstChA+='c'; break;
00157       case ']': DstChA+='C'; break;
00158       case '|': DstChA+='d'; break;
00159       case '\\': DstChA+='D'; break;
00160       case '{': DstChA+=uchar(154); break;
00161       case '[': DstChA+=uchar(138); break;
00162       case '`': DstChA+=uchar(158); break;
00163       case '@': DstChA+=uchar(142); break;
00164       default: DstChA+=Ch;
00165     }
00166   }
00167   return DstChA;
00168 }
00169 
00170 TStr THtmlLxChDef::GetIsoCeFromYuascii(const TChA& ChA){
00171   TChA DstChA;
00172   for (int ChN=0; ChN<ChA.Len(); ChN++){
00173     char Ch=ChA[ChN];
00174     switch (Ch){
00175       case '~': DstChA+=uchar(232); break;
00176       case '^': DstChA+=uchar(200); break;
00177       case '}': DstChA+=uchar(230); break;
00178       case ']': DstChA+=uchar(198); break;
00179       case '|': DstChA+=uchar(240); break;
00180       case '\\': DstChA+=uchar(208); break;
00181       case '{': DstChA+=uchar(185); break;
00182       case '[': DstChA+=uchar(169); break;
00183       case '`': DstChA+=uchar(190); break;
00184       case '@': DstChA+=uchar(174); break;
00185       default: DstChA+=Ch;
00186     }
00187   }
00188   return DstChA;
00189 }
00190 
00192 // Html-Lexical
00193 THtmlLxChDef THtmlLx::ChDef;
00194 
00195 void THtmlLx::GetEscCh(){
00196   GetCh();
00197   EscCh=(Ch=='&');
00198   if (EscCh){
00199     EscChA.Clr(); EscChA.AddCh(Ch); GetCh();
00200     if (Ch=='#'){
00201       EscChA.AddCh(Ch); GetCh();
00202       if (('0'<=Ch)&&(Ch<='9')){
00203         do {EscChA.AddCh(Ch); GetCh();} while (('0'<=Ch)&&(Ch<='9'));
00204         if (Ch==';'){GetCh();}
00205         PutStr(ChDef.GetEscStr(EscChA));
00206       } else {
00207         PutCh('#'); PutCh('&');
00208       }
00209     } else
00210     if ((('a'<=Ch)&&(Ch<='z'))||(('A'<=Ch)&&(Ch<='Z'))){
00211       do {
00212         EscChA.AddCh(Ch); GetCh();
00213       } while ((('A'<=Ch)&&(Ch<='Z'))||(('a'<=Ch)&&(Ch<='z'))||(('0'<=Ch)&&(Ch<='9')));
00214       if (Ch==';'){
00215         GetCh(); PutStr(ChDef.GetEscStr(EscChA));
00216       } else {
00217         PutStr(EscChA);
00218       }      
00219     } else {
00220       PutCh('&');
00221     }
00222   }
00223 }
00224 
00225 void THtmlLx::GetMetaTag(){
00226   Sym=hsyMTag;
00227   if (Ch=='-'){
00228     char PCh=' ';
00229     while ((Ch!=TCh::EofCh) && ((PCh!='-')||(Ch!='>'))){PCh=Ch; GetCh();}
00230   } else {
00231     while ((Ch!=TCh::EofCh) && (Ch!='>')){GetCh();}
00232   }
00233   if (Ch!=TCh::EofCh){GetEscCh();}
00234 }
00235 
00236 void THtmlLx::GetTag(){
00237   if (Ch=='/'){Sym=hsyETag; GetCh();} else {Sym=hsyBTag;}
00238   UcChA.AddCh('<');
00239   while (ChDef.IsAlNum(Ch)||(Ch==':')){
00240     UcChA.AddCh(ChDef.GetUc(Ch)); GetCh();}
00241   UcChA.AddCh('>');
00242   ChA=UcChA;
00243 
00244   if (DoParseArg){
00245     while ((Ch!='>')&&(Ch!=TCh::EofCh)){
00246       while ((!ChDef.IsAlpha(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
00247       if (ChDef.IsAlpha(Ch)){
00248         ArgNm.Clr(); ArgVal.Clr();
00249         while (ChDef.IsAlNum(Ch)||(Ch=='-')){ArgNm.AddCh(ChDef.GetUc(Ch)); GetCh();}
00250         while (ChDef.IsWs(Ch)){GetCh();}
00251         if (Ch=='='){
00252           GetCh(); while (ChDef.IsWs(Ch)){GetCh();}
00253           if (Ch=='"'){
00254             GetCh();
00255             while ((Ch!=TCh::EofCh)&&(Ch!='"')&&(Ch!='>')){
00256               if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
00257             if (Ch=='"'){GetCh();}
00258           } else if (Ch=='\''){
00259             GetCh();
00260             while ((Ch!=TCh::EofCh)&&(Ch!='\'')&&(Ch!='>')){
00261               if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
00262             if (Ch=='\''){GetCh();}
00263           } else {
00264             while ((!ChDef.IsWs(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){
00265               ArgVal.AddCh(Ch); GetCh();}
00266           }
00267           ArgNmValV.Add(TStrKd(ArgNm, ArgVal));
00268         }
00269       }
00270     }
00271   } else {
00272     while ((Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
00273   }
00274   if (Ch!=TCh::EofCh){GetEscCh();}
00275 }
00276 
00277 THtmlLxSym THtmlLx::GetSym(){
00278   // prepare symbol descriptions
00279   ChA.Clr(); UcChA.Clr();
00280   PreSpaces=0; PreSpaceChA.Clr();
00281   ArgNmValV.Clr();
00282   // skip white-space
00283   while (ChDef.IsSpace(Ch)){
00284     if (ChX>0){PreSpaceChA+=Ch; PreSpaces++;} GetEscCh();}
00285   // parse symbol
00286   SymChA.Clr(); SymChA+=Ch; SymBChX=ChX;
00287   switch (ChDef.GetChTy(Ch)){
00288     case hlctAlpha:
00289       Sym=hsyStr;
00290       forever{
00291         do {
00292           ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh();
00293         } while (ChDef.IsAlNum(Ch));
00294         if (Ch=='.'){
00295           GetCh();
00296           if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
00297           else {PutCh(Ch); Ch='.'; break;}
00298         } else {break;}
00299       }
00300       break;
00301     case hlctNum:
00302       Sym=hsyNum;
00303       forever{
00304         do {
00305           ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00306         } while (ChDef.IsNum(Ch));
00307         if (Ch=='.'){
00308           GetCh();
00309           if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
00310           else {PutCh(Ch); Ch='.'; break;}
00311         } else if (ChDef.IsAlpha(Ch)){
00312           Sym=hsyStr;
00313         } else {
00314           break;
00315         }
00316       }
00317       break;
00318     case hlctSym:
00319       Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00320       if ((ChA.LastCh()=='.')&&(ChDef.IsAlNum(Ch))){
00321         Sym=hsyStr;
00322         do {
00323           ChA.AddCh(Ch); UcChA.AddCh(ChDef.GetUc(Ch)); GetEscCh();
00324         } while (ChDef.IsAlNum(Ch));
00325       }
00326       break;
00327     case hlctLTag:
00328       if (EscCh){
00329         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00330       } else {
00331         GetCh();
00332         if (Ch=='!'){GetCh(); GetMetaTag();} else {GetTag();}
00333       }
00334       break;
00335     case hlctRTag:
00336       if (EscCh){
00337         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
00338       } else {
00339         Sym=hsySSym; ChA.AddCh(Ch); UcChA.AddCh(Ch);  GetEscCh();
00340       }
00341       break;
00342     case hlctEof: Sym=hsyEof; break;
00343     default: Sym=hsyUndef; GetEscCh();
00344   }
00345   // set symbol last-character-position
00346   SymEChX=ChX-1;
00347   // delete last character
00348   if (!SymChA.Empty()){SymChA.Pop();}
00349   // return symbol
00350   return Sym;
00351 }
00352 
00353 PHtmlTok THtmlLx::GetTok(const bool& DoUc){
00354   if (DoUc){return PHtmlTok(new THtmlTok(Sym, UcChA, ArgNmValV));}
00355   else {return PHtmlTok(new THtmlTok(Sym, ChA, ArgNmValV));}
00356 }
00357 
00358 TStr THtmlLx::GetFullBTagStr() const {
00359   IAssert(Sym==hsyBTag);
00360   TChA BTagChA;
00361   BTagChA+=ChA; BTagChA.Pop();
00362   for (int ArgN=0; ArgN<GetArgs(); ArgN++){
00363     BTagChA+=' '; BTagChA+=GetArgNm(ArgN);
00364     BTagChA+='='; BTagChA+='"'; BTagChA+=GetArgVal(ArgN); BTagChA+='"';
00365   }
00366   BTagChA+='>';
00367   return BTagChA;
00368 }
00369 
00370 void THtmlLx::MoveToStrOrEof(const TStr& Str){
00371   do {
00372     GetSym();
00373   } while ((Sym!=hsyEof)&&((Sym!=hsyStr)||(ChA!=Str)));
00374 }
00375 
00376 void THtmlLx::MoveToBTagOrEof(const TStr& TagNm){
00377   do {
00378     GetSym();
00379   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||(UcChA!=TagNm)));
00380 }
00381 
00382 void THtmlLx::MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2){
00383   do {
00384     GetSym();
00385   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2))));
00386 }
00387 
00388 void THtmlLx::MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3){
00389   do {
00390     GetSym();
00391   } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)&&(UcChA!=TagNm3))));
00392 }
00393 
00394 void THtmlLx::MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm){
00395   do {
00396     GetSym();
00397   } while ((Sym!=hsyEof) && ((Sym!=hsyBTag)||(UcChA!=BTagNm)) && ((Sym!=hsyETag) || (UcChA!=ETagNm)));
00398 }
00399 
00400 void THtmlLx::MoveToBTagArgOrEof(
00401  const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal){
00402   forever {
00403     GetSym();
00404     if (Sym==hsyEof){break;}
00405     if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00406      (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal)){break;}
00407   }
00408 }
00409 
00410 void THtmlLx::MoveToBTagArg2OrEof(const TStr& TagNm,
00411  const TStr& ArgNm1, const TStr& ArgVal1,
00412  const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP){
00413   forever {
00414     GetSym();
00415     if (Sym==hsyEof){break;}
00416     if (AndOpP){
00417       if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00418        (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)&&
00419        (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
00420     } else {
00421       if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
00422        (((IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1))||
00423         ((IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)))){break;}
00424     }
00425   }
00426 }
00427 
00428 void THtmlLx::MoveToBTagOrEof(
00429  const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
00430  const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2){
00431   forever {
00432     GetSym();
00433     if (Sym==hsyEof){break;}
00434     if ((Sym==hsyBTag)&&(UcChA==TagNm1)&&
00435      (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)){break;}
00436     if ((Sym==hsyBTag)&&(UcChA==TagNm2)&&
00437      (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
00438   }
00439 }
00440 
00441 void THtmlLx::MoveToETagOrEof(const TStr& TagNm){
00442   do {
00443     GetSym();
00444   } while ((Sym!=hsyEof)&&((Sym!=hsyETag)||(UcChA!=TagNm)));
00445 }
00446 
00447 TStr THtmlLx::GetTextOnlyStrToEof(){
00448   TChA OutChA;
00449   forever {
00450     GetSym();
00451     if (Sym==hsyEof){
00452       break;
00453     } else {
00454       if (PreSpaces>0){OutChA+=' ';}
00455       if ((Sym!=hsyBTag)&&(Sym!=hsyETag)){
00456         OutChA+=ChA;}
00457     }
00458   }
00459   return OutChA;
00460 }
00461 
00462 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP){
00463   TChA OutChA;
00464   forever {
00465     GetSym();
00466     if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm))){
00467       break;
00468     } else {
00469       if (PreSpaces>0){OutChA+=' ';}
00470       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00471         OutChA+=ChA;}
00472     }
00473   }
00474   return OutChA;
00475 }
00476 
00477 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
00478  const TStr& ArgVal, const bool& TxtOnlyP){
00479   TChA OutChA;
00480   forever {
00481     GetSym();
00482     if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm)&&
00483      (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal))){
00484       break;
00485     } else {
00486       if (PreSpaces>0){OutChA+=' ';}
00487       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00488         OutChA+=ChA;}
00489     }
00490   }
00491   return OutChA;
00492 }
00493 
00494 TStr THtmlLx::GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP){
00495   TChA OutChA;
00496   forever {
00497     GetSym();
00498     if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm))){
00499       break;
00500     } else {
00501       if (PreSpaces>0){OutChA+=' ';}
00502       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00503         OutChA+=ChA;}
00504     }
00505   }
00506   return OutChA;
00507 }
00508 
00509 TStr THtmlLx::GetStrToETag2(const TStr& TagNm1, 
00510  const TStr& TagNm2, const bool& TxtOnlyP){
00511   TChA OutChA;
00512   forever {
00513     GetSym();
00514     if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm1))||((Sym==hsyETag)&&(UcChA==TagNm2))){
00515       break;
00516     } else {
00517       if (PreSpaces>0){OutChA+=' ';}
00518       if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
00519         OutChA+=ChA;}
00520     }
00521   }
00522   return OutChA;
00523 }
00524 
00525 TStr THtmlLx::GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP){
00526   MoveToBTagOrEof(TagNm);
00527   return GetStrToETag(TagNm, TxtOnlyP);
00528 }
00529 
00530 TStr THtmlLx::GetHRefBeforeStr(const TStr& Str){
00531   TStr HRefStr;
00532   forever {
00533     GetSym();
00534     if (Sym==hsyEof){HRefStr=""; break;}
00535     if ((Sym==hsyBTag)&&(UcChA=="<A>")){HRefStr=GetArg("HREF");}
00536     if ((Sym==hsyStr)&&(ChA==Str)){break;}
00537   }
00538   return HRefStr;
00539 }
00540 
00541 bool THtmlLx::IsGetBTag(const TStr& TagNm){
00542   if (GetSym()==hsyBTag){
00543     return ChA==TagNm;
00544   } else {return false;}
00545 }
00546 
00547 bool THtmlLx::IsGetETag(const TStr& TagNm){
00548   if (GetSym()==hsyETag){
00549     return ChA==TagNm;
00550   } else {return false;}
00551 }
00552 
00553 TStr THtmlLx::GetSymStr(const THtmlLxSym& Sym){
00554   switch (Sym){
00555     case hsyUndef: return "Undef";
00556     case hsyStr: return "Str";
00557     case hsyNum: return "Num";
00558     case hsySSym: return "SSym";
00559     case hsyUrl: return "Url";
00560     case hsyBTag: return "BTag";
00561     case hsyETag: return "ETag";
00562     case hsyMTag: return "MTag";
00563     case hsyEof: return "Eof";
00564     default: Fail; return TStr();
00565   }
00566 }
00567 
00568 TStr THtmlLx::GetEscapedStr(const TChA& ChA){
00569   TChA EscapedChA;
00570   for (int ChN=0; ChN<ChA.Len(); ChN++){
00571     char Ch=ChA[ChN];
00572     switch (Ch){
00573       case '"': EscapedChA+="&quot;"; break;
00574       case '&': EscapedChA+="&amp;"; break;
00575       case '\'': EscapedChA+="&apos;"; break;
00576       case '<': EscapedChA+="&lt;"; break;
00577       case '>': EscapedChA+="&gt;"; break;
00578       default: EscapedChA+=Ch;
00579     }
00580   }
00581   return EscapedChA;
00582 }
00583 
00584 TStr THtmlLx::GetAsciiStr(const TChA& ChA, const char& GenericCh){
00585   TChA AsciiChA;
00586   for (int ChN=0; ChN<ChA.Len(); ChN++){
00587     char Ch=ChA[ChN];
00588     if ((Ch<' ')||('~'<Ch)){
00589       Ch=GenericCh;}
00590     AsciiChA+=Ch;
00591   }
00592   return AsciiChA;
00593 }
00594 
00595 void THtmlLx::GetTokStrV(const TStr& Str, TStrV& TokStrV){
00596   PSIn SIn=TStrIn::New(Str);
00597   THtmlLx Lx(SIn);
00598   Lx.GetSym();
00599   TokStrV.Clr();
00600   while (Lx.Sym!=hsyEof){
00601     TokStrV.Add(Lx.ChA);
00602     Lx.GetSym();
00603   }
00604 }
00605 
00606 TStr THtmlLx::GetNoTag(const TStr& Str) {
00607   PSIn SIn=TStrIn::New(Str);
00608   THtmlLx Lx(SIn);
00609   Lx.GetSym();
00610   TChA ChA;
00611   while (Lx.Sym!=hsyEof){
00612     switch (Lx.Sym){
00613           case hsyUndef: 
00614           case hsyStr: 
00615           case hsyNum: 
00616           case hsySSym:
00617                 if (Lx.PreSpaces > 0) { ChA += ' '; }
00618                 ChA += Lx.ChA;
00619           default: break;
00620         }
00621         Lx.GetSym();
00622   }
00623   return ChA;
00624 }
00625 
00627 // Html-Token
00628 TStr THtmlTok::GetFullStr() const {
00629   if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){
00630     TChA FullChA;
00631     FullChA+=Str.GetSubStr(0, Str.Len()-2);
00632     for (int ArgNmValN=0; ArgNmValN<ArgNmValV.Len(); ArgNmValN++){
00633       FullChA+=' '; FullChA+=ArgNmValV[ArgNmValN].Key; FullChA+='=';
00634       FullChA+='"'; FullChA+=ArgNmValV[ArgNmValN].Dat; FullChA+='"';
00635     }
00636     FullChA+='>';
00637     return FullChA;
00638   } else
00639   if (Sym==hsyETag){
00640     TChA FullChA;
00641     FullChA+='<'; FullChA+='/'; FullChA+=Str.GetSubStr(1, Str.Len()-1);
00642     return FullChA;
00643   } else {
00644     return GetStr();
00645   }
00646 }
00647 
00648 bool THtmlTok::IsUrlTok(TStr& RelUrlStr) const {
00649   if (GetSym()==hsyBTag){
00650     TStr TagNm=GetStr();
00651     if ((TagNm==ATagNm)&&(IsArg(HRefArgNm))){
00652       RelUrlStr=GetArg(HRefArgNm); return true;}
00653     else if ((TagNm==AreaTagNm)&&(IsArg(HRefArgNm))){
00654       RelUrlStr=GetArg(HRefArgNm); return true;}
00655     else if ((TagNm==FrameTagNm)&&(IsArg(SrcArgNm))){
00656       RelUrlStr=GetArg(SrcArgNm); return true;}
00657     else if ((TagNm==ImgTagNm)&&(IsArg(SrcArgNm))){
00658       RelUrlStr=GetArg(SrcArgNm); return true;}
00659     else if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
00660       TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
00661       if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
00662         TStr ContentStr=GetArg("CONTENT");
00663         TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
00664         ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
00665         RelUrlStr=ContentStr.GetSubStr(
00666          LeftStr.Len()+UrlEqStr.Len(), ContentStr.Len());
00667         return !RelUrlStr.Empty();
00668       } else {
00669         return false;
00670       }
00671     }
00672   }
00673   return false;
00674 }
00675 
00676 bool THtmlTok::IsRedirUrlTok() const {
00677   if (GetSym()==hsyBTag){
00678     TStr TagNm=GetStr();
00679     if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
00680       TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
00681       if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
00682         return true;
00683       } else {
00684         return false;
00685       }
00686     }
00687   }
00688   return false;
00689 }
00690 
00691 void THtmlTok::SaveTxt(const PSOut& SOut, const bool& TxtMode){
00692   if (TxtMode){
00693     SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
00694   } else {
00695     SOut->PutStr(THtmlLx::GetSymStr(Sym)); SOut->PutStr(" ");
00696     SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
00697   }
00698 }
00699 
00700 const TStr THtmlTok::ATagNm="<A>";
00701 const TStr THtmlTok::AreaTagNm="<AREA>";
00702 const TStr THtmlTok::BrTagNm="<BR>";
00703 const TStr THtmlTok::CardTagNm="<CARD>";
00704 const TStr THtmlTok::CenterTagNm="<CENTER>";
00705 const TStr THtmlTok::FrameTagNm="<FRAME>";
00706 const TStr THtmlTok::H1TagNm="<H1>";
00707 const TStr THtmlTok::H2TagNm="<H2>";
00708 const TStr THtmlTok::H3TagNm="<H3>";
00709 const TStr THtmlTok::H4TagNm="<H4>";
00710 const TStr THtmlTok::H5TagNm="<H5>";
00711 const TStr THtmlTok::H6TagNm="<H6>";
00712 const TStr THtmlTok::ImgTagNm="<IMG>";
00713 const TStr THtmlTok::LiTagNm="<LI>";
00714 const TStr THtmlTok::MetaTagNm="<META>";
00715 const TStr THtmlTok::PTagNm="<P>";
00716 const TStr THtmlTok::UlTagNm="<UL>";
00717 const TStr THtmlTok::TitleTagNm="<TITLE>";
00718 const TStr THtmlTok::TitleETagNm="</TITLE>";
00719 
00720 const TStr THtmlTok::AltArgNm="ALT";
00721 const TStr THtmlTok::HRefArgNm="HREF";
00722 const TStr THtmlTok::SrcArgNm="SRC";
00723 const TStr THtmlTok::TitleArgNm="TITLE";
00724 const TStr THtmlTok::HttpEquivArgNm="HTTP-EQUIV";
00725 
00726 bool THtmlTok::IsBreakTag(const TStr& TagNm){
00727   static TStrH BreakTagNmH(50);
00728   if (BreakTagNmH.Len()==0){
00729     BreakTagNmH.AddKey(TStr("<H1>")); BreakTagNmH.AddKey(TStr("<H2>"));
00730     BreakTagNmH.AddKey(TStr("<H3>")); BreakTagNmH.AddKey(TStr("<H4>"));
00731     BreakTagNmH.AddKey(TStr("<H5>")); BreakTagNmH.AddKey(TStr("<H6>"));
00732     BreakTagNmH.AddKey(TStr("<BR>")); BreakTagNmH.AddKey(TStr("<HR>"));
00733     BreakTagNmH.AddKey(TStr("<P>")); BreakTagNmH.AddKey(TStr("<DL>"));
00734     BreakTagNmH.AddKey(TStr("<UL>")); BreakTagNmH.AddKey(TStr("<OL>"));
00735     BreakTagNmH.AddKey(TStr("<LI>")); BreakTagNmH.AddKey(TStr("<DT>"));
00736     BreakTagNmH.AddKey(TStr("<DD>")); BreakTagNmH.AddKey(TStr("<HEAD>"));
00737     BreakTagNmH.AddKey(TStr("<TITLE>")); BreakTagNmH.AddKey(TStr("<META>"));
00738     BreakTagNmH.AddKey(TStr("<SCRIPT>"));
00739     BreakTagNmH.AddKey(TStr("<HEAD>")); BreakTagNmH.AddKey(TStr("<BODY>"));
00740   }
00741   return BreakTagNmH.IsKey(TagNm);
00742 }
00743 
00744 bool THtmlTok::IsBreakTok(const PHtmlTok& Tok){
00745   if ((Tok->GetSym()==hsyBTag)||(Tok->GetSym()==hsyETag)){
00746     return IsBreakTag(Tok->GetStr());
00747   } else {
00748     return false;
00749   }
00750 }
00751 
00752 bool THtmlTok::IsHTag(const TStr& TagNm, int& HTagN){
00753   if ((TagNm.Len()==4)&&(TagNm[0]=='<')&&(TagNm[1]=='H')&&(TagNm[3]=='>')){
00754     char Ch=TagNm[2];
00755     if (('1'<=Ch)&&(Ch<='6')){HTagN=Ch-'0'; return true;}
00756     else {HTagN=-1; return false;}
00757   } else {
00758     HTagN=-1; return false;
00759   }
00760 }
00761 
00762 PHtmlTok THtmlTok::GetHTok(const bool& IsBTag, const int& HTagN){
00763   THtmlLxSym HTagSym=IsBTag?hsyBTag:hsyETag;
00764   TStr HTagNm;
00765   switch (HTagN){
00766     case 1: HTagNm=H1TagNm; break;
00767     case 2: HTagNm=H2TagNm; break;
00768     case 3: HTagNm=H3TagNm; break;
00769     case 4: HTagNm=H4TagNm; break;
00770     case 5: HTagNm=H5TagNm; break;
00771     case 6: HTagNm=H6TagNm; break;
00772     default: Fail;
00773   }
00774   return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
00775 }
00776 
00778 // Html-Document
00779 THtmlDoc::THtmlDoc(const PSIn& SIn, const THtmlDocType& Type, const bool& DoUc):
00780   TokV(1000, 0){
00781   THtmlLx Lx(SIn);
00782   bool MkTok=false; bool InUL=false;
00783   while (Lx.GetSym()!=hsyEof){
00784     switch (Type){
00785       case hdtAll: MkTok=true; break;
00786       case hdtStr: MkTok=(Lx.Sym==hsyStr); break;
00787       case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break;
00788       case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break;
00789       case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break;
00790       case hdtHRef:
00791         MkTok=(Lx.Sym==hsyBTag)&&
00792          ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
00793          (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
00794          (Lx.UcChA==THtmlTok::MetaTagNm));
00795         break;
00796       case hdtUL:
00797         if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
00798         MkTok=InUL;
00799         if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
00800         break;
00801       default: Fail;
00802     }
00803     if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
00804   }
00805   TokV.Add(PHtmlTok(new THtmlTok(hsyEof)));
00806 }
00807 
00808 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr){
00809   TChA LnDocChA;
00810   // prepare html parsing
00811   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00812   THtmlLx HtmlLx(HtmlSIn);
00813   bool InScript=false;
00814   // save text
00815   while (HtmlLx.GetSym()!=hsyEof){
00816     TStr Str=HtmlLx.ChA;
00817     switch (HtmlLx.Sym){
00818       case hsyStr:
00819       case hsyNum:
00820       case hsySSym:
00821         if (InScript){break;}
00822         if (HtmlLx.PreSpaces>0){LnDocChA+=' ';}
00823         LnDocChA+=Str.CStr();
00824         break;
00825       case hsyBTag:
00826         if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
00827         if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;}
00828         break;
00829       case hsyETag:
00830         if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
00831         if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;}
00832         break;
00833       default: break;
00834     }
00835   }
00836   // return result
00837   return LnDocChA;
00838 }
00839 
00840 TStr THtmlDoc::GetTxtLnDoc(const TStr& HtmlStr, 
00841  const TStr& BaseUrlStr, const bool& OutUrlP, const bool& OutTagsP){
00842   // prepare output-string
00843   TChA OutChA; OutChA+=' ';
00844   // prepare html parsing
00845   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00846   THtmlLx HtmlLx(HtmlSIn);
00847   bool InScript=false;
00848   // save text
00849   while (HtmlLx.GetSym()!=hsyEof){
00850     TStr Str=HtmlLx.ChA;
00851     switch (HtmlLx.Sym){
00852       case hsyUndef:
00853       case hsyUrl:
00854       case hsyMTag:
00855         break;
00856       case hsyStr:
00857       case hsyNum:
00858       case hsySSym:
00859         if (InScript){break;}
00860         if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}}
00861         OutChA+=Str;
00862         break;
00863       case hsyBTag:
00864         // extract tag name
00865         Str=Str.GetSubStr(1, Str.Len()-2);
00866         // process tag
00867         if (!InScript){
00868           // check script tag
00869           if (Str=="SCRIPT"){
00870             InScript=true; break;}
00871           // output tag
00872           if (OutTagsP){
00873             OutChA+='<'; OutChA+=Str; OutChA+='>';
00874           } else {
00875             if (OutChA.LastCh()!=' '){OutChA+=' ';}
00876           }
00877           // check if URL present
00878           PHtmlTok Tok=HtmlLx.GetTok();
00879           TStr RelUrlStr;
00880           if (Tok->IsUrlTok(RelUrlStr)){
00881             PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
00882             if (Url->IsOk()){
00883               if (OutUrlP){
00884                 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
00885                 OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
00886               }
00887             }
00888           }
00889         }
00890         break;
00891       case hsyETag:
00892         // extract tag name
00893         Str=Str.GetSubStr(1, Str.Len()-2);
00894         // process tag
00895         if (InScript){
00896           if (Str=="SCRIPT"){
00897             InScript=false; break;}
00898         } else {
00899           if (OutTagsP){
00900             OutChA+="</"; OutChA+=Str; OutChA+='>';
00901           } else {
00902             if (OutChA.LastCh()!=' '){OutChA+=' ';}
00903           }
00904         }
00905         break;
00906       case hsyEof: break;
00907       default: Fail;
00908     }
00909   }
00910   // return string
00911   return OutChA;
00912 }
00913 
00914 
00915 void THtmlDoc::SaveTxt(const PSOut& SOut, const bool& TxtMode) const {
00916   if (TxtMode){
00917     for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);}
00918     SOut->PutLn();
00919   } else {
00920     for (int TokN=0; TokN<TokV.Len(); TokN++){
00921       SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": ");
00922       TokV[TokN]->SaveTxt(SOut);
00923       SOut->PutLn();
00924     }
00925   }
00926 }
00927 
00928 void THtmlDoc::SaveHtmlToTxt(
00929  const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
00930  const bool& OutUrlP, const bool& OutTagsP){
00931   // get text-string from html-string
00932   TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP);
00933   // save text-string
00934   TxtStr.SaveTxt(TxtSOut);
00935 }
00936 
00937 void THtmlDoc::SaveHtmlToTxt(
00938  const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
00939  const bool& OutUrlP, const bool& OutTagsP){
00940   // create output file
00941   PSOut TxtSOut=TFOut::New(TxtFNm);
00942   // save to output file
00943   SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
00944 }
00945 
00946 void THtmlDoc::SaveHtmlToXml(
00947  const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
00948  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
00949  const bool& OutTagsP, const bool& OutArgsP){
00950   // prepare output-file-id
00951   TFileId fXml=XmlSOut->GetFileId();
00952   // create outgoing url
00953   TStrV OutUrlStrV;
00954   // open top tag
00955   fprintf(fXml, "<HtmlDoc>\n");
00956   // save url
00957   if (!BaseUrlStr.Empty()){
00958     TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
00959     fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
00960   }
00961   // prepare html parsing
00962   PSIn HtmlSIn=TStrIn::New(HtmlStr);
00963   THtmlLx HtmlLx(HtmlSIn);
00964   TChA ContTextChA; bool InScript=false;
00965   // save text
00966   fprintf(fXml, "<Body>\n");
00967   while (HtmlLx.GetSym()!=hsyEof){
00968     TStr Str=HtmlLx.ChA;
00969     switch (HtmlLx.Sym){
00970       case hsyUndef:
00971       case hsyUrl:
00972       case hsyMTag:
00973         break;
00974       case hsyStr:
00975         if (InScript){break;}
00976         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00977         if (OutToksP){
00978           fprintf(fXml, "  <Str>%s</Str>\n", Str.CStr());}
00979         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00980         break;
00981       case hsyNum:
00982         if (InScript){break;}
00983         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00984         if (OutToksP){
00985           fprintf(fXml, "  <Num>%s</Num>\n", Str.CStr());}
00986         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00987         break;
00988       case hsySSym:
00989         if (InScript){break;}
00990         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
00991         if (OutToksP){
00992           fprintf(fXml, "  <Sym>%s</Sym>\n", Str.CStr());}
00993         if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
00994         break;
00995       case hsyBTag:{
00996         // save continuos text
00997         if (!ContTextChA.Empty()){
00998           if (OutTextP){
00999             fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01000           ContTextChA.Clr();
01001         }
01002         // extract tag name
01003         Str=Str.GetSubStr(1, Str.Len()-2);
01004         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
01005         // process tag
01006         if (!InScript){
01007           // check script tag
01008           if (Str=="SCRIPT"){
01009             InScript=true; break;}
01010           // output tag
01011           if (OutTagsP){
01012             if (OutArgsP){
01013               fprintf(fXml, "  <BTag Nm=\"%s\">\n", Str.CStr());
01014               for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
01015                 TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
01016                 TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
01017                 fprintf(fXml, "    <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
01018               }
01019               fprintf(fXml, "  </BTag>\n");
01020             } else {
01021               fprintf(fXml, "  <BTag Nm=\"%s\"/>\n", Str.CStr());
01022             }
01023           }
01024           // check if URL present
01025           PHtmlTok Tok=HtmlLx.GetTok();
01026           TStr RelUrlStr;
01027           if (Tok->IsUrlTok(RelUrlStr)){
01028             PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
01029             if (Url->IsOk()){
01030               OutUrlStrV.Add(Url->GetUrlStr());
01031               if (OutUrlP){
01032                 TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
01033                 fprintf(fXml, "  <Url>%s</Url>\n", XmlUrlStr.CStr());
01034               }
01035             }
01036           }
01037         }
01038         break;}
01039       case hsyETag:{
01040         // save continuos text
01041         if (!ContTextChA.Empty()){
01042           if (OutTextP){
01043             fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01044           ContTextChA.Clr();
01045         }
01046         // extract tag name
01047         Str=Str.GetSubStr(1, Str.Len()-2);
01048         Str=TXmlLx::GetXmlStrFromPlainStr(Str);
01049         // process tag
01050         if (InScript){
01051           if (Str=="SCRIPT"){
01052             InScript=false; break;}
01053         } else {
01054           if (OutTagsP){
01055             fprintf(fXml, "  <ETag Nm=\"%s\"/>\n", Str.CStr());}
01056         }
01057         break;}
01058       case hsyEof: break;
01059       default: Fail;
01060     }
01061   }
01062   // save continuos text
01063   if (!ContTextChA.Empty()){
01064     if (OutTextP){
01065       fprintf(fXml, "  <Text>%s</Text>\n", ContTextChA.CStr());}
01066     ContTextChA.Clr();
01067   }
01068   fprintf(fXml, "</Body>\n");
01069   // save outgoing urls
01070   fprintf(fXml, "<OutUrls>\n");
01071   for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
01072     TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
01073     fprintf(fXml, "  <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
01074   }
01075   fprintf(fXml, "</OutUrls>\n");
01076 
01077   // close top tag
01078   fprintf(fXml, "</HtmlDoc>\n");
01079 }
01080 
01081 void THtmlDoc::SaveHtmlToXml(
01082  const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
01083  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
01084  const bool& OutTagsP, const bool& OutArgsP){
01085   // create output file
01086   PSOut XmlSOut=TFOut::New(XmlFNm);
01087   // save to output file
01088   SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP,
01089    OutToksP, OutTagsP, OutArgsP);
01090 }
01091 
01092 TLxSym THtmlDoc::GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA){
01093   switch (HtmlLxSym){
01094     case hsyUndef: return syUndef;
01095     case hsyStr: return syStr;
01096     case hsyNum: return syFlt;
01097     case hsySSym: return TLxSymStr::GetSSym(ChA);
01098     case hsyUrl: return syStr;
01099     case hsyBTag: return syStr;
01100     case hsyETag: return syStr;
01101     case hsyEof: return syEof;
01102     default: Fail; return syUndef;
01103   }
01104 }
01105 
01106 bool THtmlDoc::_IsTagRedir(
01107  const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
01108  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
01109   IAssert(Lx.Sym==hsyBTag);
01110   if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){
01111     TStr RelUrlStr=Lx.GetArg(ArgNm);
01112     PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
01113     if (Url->IsOk(usHttp)){
01114       TStr UrlStr=Url->GetUrlStr();
01115       PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
01116       Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
01117       return true;
01118     } else {
01119       return false;
01120     }
01121   } else {
01122     return false;
01123   }
01124 }
01125 
01126 TStr THtmlDoc::GetRedirHtmlDocStr(const TStr& HtmlStr,
01127  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
01128   PSIn SIn=TStrIn::New(HtmlStr);
01129   TMOut SOut;
01130   THtmlLx Lx(SIn);
01131   while (Lx.GetSym()!=hsyEof){
01132     SOut.PutStr(Lx.PreSpaceChA);
01133     if ((Lx.Sym==hsyBTag)&&(
01134      (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01135      (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01136      (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
01137      (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){
01138       SOut.PutStr(Lx.GetFullBTagStr());
01139     } else {
01140       SOut.PutStr(Lx.SymChA());
01141     }
01142   }
01143   return SOut.GetAsStr();
01144 }
01145 
01147 // Html-Hyper-Link-Document-Vector
01148 THtmlHldV::THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen):
01149   RefHtmlDoc(_RefHtmlDoc), HldV(){
01150   bool IsTitleAct=false; THtmlTokV TitleTokV;
01151   bool IsHAct=false; int ActHTagN=-1;
01152   TVec<THtmlTokV> HTokV(6);
01153   PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
01154   for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
01155     Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
01156     if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
01157       // collect tokens before, inside and after <a> ... </a> tags
01158       int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
01159       // inside <A> tags
01160       THtmlTokV ATokV; ATokN=TokN;
01161       forever{
01162         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01163         if (ATokSym!=hsySSym){ATokV.Add(ATok);}
01164         if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
01165         ATokN++;
01166         if (ATokN>=RefHtmlDoc->GetToks()){break;}
01167       }
01168       int ETagATokN=ATokN+1;
01169       // before <A> tags
01170       THtmlTokV PrevATokV; ATokN=TokN;
01171       forever{
01172         ATokN--;
01173         if (ATokN<0){break;}
01174         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01175         if (THtmlTok::IsBreakTok(ATok)){break;}
01176         if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
01177         if (ATokV.Len()>=HldWnLen){break;}
01178       }
01179       // after <A> tags
01180       THtmlTokV NextATokV; ATokN=ETagATokN;
01181       forever{
01182         ATokN++;
01183         if (ATokN>=RefHtmlDoc->GetToks()){break;}
01184         ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
01185         if (THtmlTok::IsBreakTok(ATok)){break;}
01186         if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
01187         if (ATokV.Len()>=HldWnLen){break;}
01188       }
01189       // construct html-document with hyper-link context
01190       PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
01191       HtmlDoc->AddTokV(TitleTokV);
01192       for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
01193       HtmlDoc->AddTokV(PrevATokV);
01194       HtmlDoc->AddTokV(ATokV);
01195       HtmlDoc->AddTokV(NextATokV);
01196       HldV.Add(HtmlDoc);
01197       HtmlDoc->SaveTxt(TSOut::StdOut);
01198     } else
01199     if (TokSym==hsyBTag){
01200       int HTagN;
01201       if (TokStr==THtmlTok::TitleTagNm){
01202         IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
01203       } else
01204       if (THtmlTok::IsHTag(TokStr, HTagN)){
01205         if (IsHAct){// conclude previous <H?> tag if left open
01206           HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));}
01207         IsHAct=true; ActHTagN=HTagN;
01208         {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}}
01209         HTokV[ActHTagN-1].Add(Tok);
01210       }
01211     } else
01212     if (TokSym==hsyETag){
01213       int HTagN;
01214       if (TokStr==THtmlTok::TitleTagNm){
01215         if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
01216       } else
01217       if (THtmlTok::IsHTag(TokStr, HTagN)){
01218         if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
01219       }
01220     } else
01221     if (TokSym!=hsySSym){
01222       if (IsTitleAct){TitleTokV.Add(Tok);}
01223       if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
01224     }
01225   }
01226 }
01227 
01229 // Web-Page
01230 void TWebPg::GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const {
01231   // create outgoing url vector
01232   OutUrlV.Clr(); OutRedirUrlV.Clr();
01233   // take interesting web-page components
01234   TStr UrlStr=GetUrlStr();
01235   TStr HtmlStr=GetHttpBodyAsStr();
01236   // prepare html parsing
01237   PSIn HtmlSIn=TStrIn::New(HtmlStr);
01238   PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
01239   PHtmlTok Tok;
01240   // traverse html
01241   for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
01242     PHtmlTok Tok=HtmlDoc->GetTok(TokN);
01243     if (Tok->GetSym()==hsyBTag){
01244       TStr RelUrlStr;
01245       if (Tok->IsUrlTok(RelUrlStr)){
01246         PUrl Url=TUrl::New(RelUrlStr, UrlStr);
01247         if (Url->IsOk(usHttp)){
01248           OutUrlV.Add(Url);
01249           if (Tok->IsRedirUrlTok()){
01250             OutRedirUrlV.Add(Url);
01251           }
01252         }
01253       }
01254     }
01255   }
01256 }
01257 
01258 void TWebPg::GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const {
01259   // create outgoing url vector
01260   OutDescUrlStrKdV.Clr();
01261   // take interesting web-page components
01262   TStr UrlStr=GetUrlStr();
01263   TStr HtmlStr=GetHttpBodyAsStr();
01264   // prepare html parsing
01265   PSIn HtmlSIn=TStrIn::New(HtmlStr);
01266   PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
01267   // traverse html documents
01268   PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
01269   int TokN=0; int Toks=HtmlDoc->GetToks();
01270   while (TokN<Toks){
01271     Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
01272     if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
01273       TStr RelUrlStr;
01274       if (Tok->IsUrlTok(RelUrlStr)){
01275         PUrl Url=TUrl::New(RelUrlStr, UrlStr);
01276         if (Url->IsOk()){
01277           TChA DescChA;
01278           while (TokN<Toks){
01279             Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
01280             if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
01281               break;
01282             } else {
01283               if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
01284                 if (!DescChA.Empty()){DescChA+=' ';}
01285                 DescChA+=TokStr;
01286               }
01287             }
01288           }
01289           OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
01290         }
01291       }
01292     }
01293   }
01294 }
01295 
01296 void TWebPg::SaveAsHttpBody(const TStr& FNm) const {
01297   // create output file
01298   PSOut SOut=TFOut::New(FNm);
01299   // save http-body
01300   HttpResp->SaveBody(SOut);
01301 }
01302 
01303 void TWebPg::SaveAsHttp(const TStr& FNm) const {
01304   // create output file
01305   PSOut SOut=TFOut::New(FNm);
01306   // save http
01307   HttpResp->SaveTxt(SOut);
01308 }
01309 
01310 bool TWebPg::IsTxt() const {
01311   if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){
01312     TStr Str=HttpResp->GetBodyAsStr();
01313     int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
01314     while ((ChN<100)&&(ChN<StrLen)){
01315       char Ch=Str[ChN++];
01316       if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
01317         PrintChs++;}
01318     }
01319     double PrintPrb=double(PrintChs)/double(ChN+1);
01320     return PrintPrb>0.9;
01321   } else {
01322     return false;
01323   }
01324 }
01325