SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
html.cpp
Go to the documentation of this file.
1 // Html-Lexical-Chars
3 void THtmlLxChDef::SetUcCh(const char& UcCh, const char& LcCh){
4  // update upper-case (more lower cases may have one upper case)
5  IAssert(
6  (UcChV[LcCh-TCh::Mn]==TCh(0))||
7  (UcChV[LcCh-TCh::Mn]==TCh(LcCh)));
8  UcChV[LcCh-TCh::Mn]=TCh(UcCh);
9  // update lower-case (one upper case may have only one lower case)
10  if ((LcChV[UcCh-TCh::Mn]==TCh(0))||(LcChV[UcCh-TCh::Mn]==TCh(UcCh))){
11  LcChV[UcCh-TCh::Mn]=TCh(LcCh);
12  }
13 }
14 
15 void THtmlLxChDef::SetUcCh(const TStr& Str){
16  // set type of characters as letters
17  SetChTy(hlctAlpha, Str);
18  // first char in string is upper-case, rest are lower-case
19  for (int ChN=1; ChN<Str.Len(); ChN++){
20  SetUcCh(Str[0], Str[ChN]);
21  }
22 }
23 
24 void THtmlLxChDef::SetChTy(const THtmlLxChTy& ChTy, const TStr& Str){
25  for (int ChN=0; ChN<Str.Len(); ChN++){
26  ChTyV[Str[ChN]-TCh::Mn]=TInt(ChTy);}
27 }
28 
29 void THtmlLxChDef::SetEscStr(const TStr& SrcStr, const TStr& DstStr){
30  EscStrH.AddDat(SrcStr, DstStr);
31 }
32 
33 TStr THtmlLxChDef::GetEscStr(const TStr& Str) const {
34  int EscStrId;
35  if ((EscStrId=EscStrH.GetKeyId(Str))!=-1){
36  return EscStrH[EscStrId];
37  } else
38  if ((Str.Len()>=2)&&(Str[0]=='&')&&(Str[1]=='#')){
39  int ChCd=0;
40  for (int ChN=2; ChN<Str.Len(); ChN++){
41  if (ChCd<=0xFFFF){ChCd=ChCd*10+Str[ChN]-'0';}}
42  return TStr((char)ChCd);
43  } else {
44  return TStr(' ');
45  }
46 }
47 
49  ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
50 
51  // Character-Types
53  SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
54  SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
55  SetChTy(hlctAlpha, "@_");
56  SetChTy(hlctNum, "0123456789");
57  SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
58  SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">");
60  for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
61  if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
62  //SetChTy(hlctSpace, TStr(TCh(char(160))));
63 
64  // Upper-Case
65  {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
66  SetUcCh(char(Ch), char(Ch));}}
67  SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
68  SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
69  SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
70  SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
71  SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
72  SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
73  SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
74  SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
75  SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
76  SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
77  SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
78  SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
79  SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
80  SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
81  SetUcCh("\x8e\x9e");
82  // ISO-CE
83  //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
84  //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
85  //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
86  //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
87  //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/
88 
89  // Annoying Unicode-characters
90  //SetChTy(hlctSpace, "\xc2\xef");
91 
92  // Escape-Sequences
93  SetEscStr("&quot", "\""); SetEscStr("&amp", "&");
94  SetEscStr("&lt", "<"); SetEscStr("&gt", ">");
95  SetEscStr("&nbsp", " ");
96 
97  SetEscStr("&auml", "\xe4"); SetEscStr("&Auml", "\xc4");
98  SetEscStr("&ouml", "\xf6"); SetEscStr("&Ouml", "\xd6");
99  SetEscStr("&uuml", "\xfc"); SetEscStr("&Uuml", "\xdc");
100  SetEscStr("&aring", "\xe5"); SetEscStr("&Aring", "\xc5");
101  SetEscStr("&oslash", "\xf8"); SetEscStr("&Oslash", "\xd8");
102  SetEscStr("&Aelig", "\xc6"); SetEscStr("&aelig", "\xe6");
103 
104  SetEscStr("&eacute", "e"); SetEscStr("&Eacute", "E");
105  SetEscStr("&egrave", "e"); SetEscStr("&Egrave", "E");
106  SetEscStr("&agrave", "a"); SetEscStr("&Agrave", "A");
107 }
108 
110 
112  TChA DstChA;
113  for (int ChN=0; ChN<ChA.Len(); ChN++){
114  char Ch=ChA[ChN];
115  switch (Ch){
116  case '~': DstChA+='c'; break;
117  case '^': DstChA+='C'; break;
118  case '}': DstChA+='c'; break;
119  case ']': DstChA+='C'; break;
120  case '|': DstChA+='d'; break;
121  case '\\': DstChA+='D'; break;
122  case '{': DstChA+='s'; break;
123  case '[': DstChA+='S'; break;
124  case '`': DstChA+='z'; break;
125  case '@': DstChA+='Z'; break;
126  default: DstChA+=Ch;
127  }
128  }
129  return DstChA;
130 }
131 
133  TChA DstChA;
134  for (int ChN=0; ChN<ChA.Len(); ChN++){
135  const uchar Ch=ChA[ChN];
136  switch (Ch){
137  case 232: DstChA+='c'; break;
138  case 200: DstChA+='C'; break;
139  case 154: DstChA+='s'; break;
140  case 138: DstChA+='S'; break;
141  case 158: DstChA+='z'; break;
142  case 142: DstChA+='Z'; break;
143  default: DstChA+=Ch;
144  }
145  }
146  return DstChA;
147 }
148 
150  TChA DstChA;
151  for (int ChN=0; ChN<ChA.Len(); ChN++){
152  char Ch=ChA[ChN];
153  switch (Ch){
154  case '~': DstChA+=uchar(232); break;
155  case '^': DstChA+=uchar(200); break;
156  case '}': DstChA+='c'; break;
157  case ']': DstChA+='C'; break;
158  case '|': DstChA+='d'; break;
159  case '\\': DstChA+='D'; break;
160  case '{': DstChA+=uchar(154); break;
161  case '[': DstChA+=uchar(138); break;
162  case '`': DstChA+=uchar(158); break;
163  case '@': DstChA+=uchar(142); break;
164  default: DstChA+=Ch;
165  }
166  }
167  return DstChA;
168 }
169 
171  TChA DstChA;
172  for (int ChN=0; ChN<ChA.Len(); ChN++){
173  char Ch=ChA[ChN];
174  switch (Ch){
175  case '~': DstChA+=uchar(232); break;
176  case '^': DstChA+=uchar(200); break;
177  case '}': DstChA+=uchar(230); break;
178  case ']': DstChA+=uchar(198); break;
179  case '|': DstChA+=uchar(240); break;
180  case '\\': DstChA+=uchar(208); break;
181  case '{': DstChA+=uchar(185); break;
182  case '[': DstChA+=uchar(169); break;
183  case '`': DstChA+=uchar(190); break;
184  case '@': DstChA+=uchar(174); break;
185  default: DstChA+=Ch;
186  }
187  }
188  return DstChA;
189 }
190 
192 // Html-Lexical
194 
196  GetCh();
197  EscCh=(Ch=='&');
198  if (EscCh){
199  EscChA.Clr(); EscChA.AddCh(Ch); GetCh();
200  if (Ch=='#'){
201  EscChA.AddCh(Ch); GetCh();
202  if (('0'<=Ch)&&(Ch<='9')){
203  do {EscChA.AddCh(Ch); GetCh();} while (('0'<=Ch)&&(Ch<='9'));
204  if (Ch==';'){GetCh();}
206  } else {
207  PutCh('#'); PutCh('&');
208  }
209  } else
210  if ((('a'<=Ch)&&(Ch<='z'))||(('A'<=Ch)&&(Ch<='Z'))){
211  do {
212  EscChA.AddCh(Ch); GetCh();
213  } while ((('A'<=Ch)&&(Ch<='Z'))||(('a'<=Ch)&&(Ch<='z'))||(('0'<=Ch)&&(Ch<='9')));
214  if (Ch==';'){
216  } else {
217  PutStr(EscChA);
218  }
219  } else {
220  PutCh('&');
221  }
222  }
223 }
224 
226  Sym=hsyMTag;
227  if (Ch=='-'){
228  char PCh=' ';
229  while ((Ch!=TCh::EofCh) && ((PCh!='-')||(Ch!='>'))){PCh=Ch; GetCh();}
230  } else {
231  while ((Ch!=TCh::EofCh) && (Ch!='>')){GetCh();}
232  }
233  if (Ch!=TCh::EofCh){GetEscCh();}
234 }
235 
237  if (Ch=='/'){Sym=hsyETag; GetCh();} else {Sym=hsyBTag;}
238  UcChA.AddCh('<');
239  while (ChDef.IsAlNum(Ch)||(Ch==':')){
240  UcChA.AddCh(ChDef.GetUc(Ch)); GetCh();}
241  UcChA.AddCh('>');
242  ChA=UcChA;
243 
244  if (DoParseArg){
245  while ((Ch!='>')&&(Ch!=TCh::EofCh)){
246  while ((!ChDef.IsAlpha(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
247  if (ChDef.IsAlpha(Ch)){
248  ArgNm.Clr(); ArgVal.Clr();
249  while (ChDef.IsAlNum(Ch)||(Ch=='-')){ArgNm.AddCh(ChDef.GetUc(Ch)); GetCh();}
250  while (ChDef.IsWs(Ch)){GetCh();}
251  if (Ch=='='){
252  GetCh(); while (ChDef.IsWs(Ch)){GetCh();}
253  if (Ch=='"'){
254  GetCh();
255  while ((Ch!=TCh::EofCh)&&(Ch!='"')&&(Ch!='>')){
256  if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
257  if (Ch=='"'){GetCh();}
258  } else if (Ch=='\''){
259  GetCh();
260  while ((Ch!=TCh::EofCh)&&(Ch!='\'')&&(Ch!='>')){
261  if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
262  if (Ch=='\''){GetCh();}
263  } else {
264  while ((!ChDef.IsWs(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){
265  ArgVal.AddCh(Ch); GetCh();}
266  }
268  }
269  }
270  }
271  } else {
272  while ((Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
273  }
274  if (Ch!=TCh::EofCh){GetEscCh();}
275 }
276 
278  // prepare symbol descriptions
279  ChA.Clr(); UcChA.Clr();
281  ArgNmValV.Clr();
282  // skip white-space
283  while (ChDef.IsSpace(Ch)){
284  if (ChX>0){PreSpaceChA+=Ch; PreSpaces++;} GetEscCh();}
285  // parse symbol
286  SymChA.Clr(); SymChA+=Ch; SymBChX=ChX;
287  switch (ChDef.GetChTy(Ch)){
288  case hlctAlpha:
289  Sym=hsyStr;
290  forever{
291  do {
293  } while (ChDef.IsAlNum(Ch));
294  if (Ch=='.'){
295  GetCh();
296  if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
297  else {PutCh(Ch); Ch='.'; break;}
298  } else {break;}
299  }
300  break;
301  case hlctNum:
302  Sym=hsyNum;
303  forever{
304  do {
305  ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
306  } while (ChDef.IsNum(Ch));
307  if (Ch=='.'){
308  GetCh();
309  if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
310  else {PutCh(Ch); Ch='.'; break;}
311  } else if (ChDef.IsAlpha(Ch)){
312  Sym=hsyStr;
313  } else {
314  break;
315  }
316  }
317  break;
318  case hlctSym:
320  if ((ChA.LastCh()=='.')&&(ChDef.IsAlNum(Ch))){
321  Sym=hsyStr;
322  do {
324  } while (ChDef.IsAlNum(Ch));
325  }
326  break;
327  case hlctLTag:
328  if (EscCh){
330  } else {
331  GetCh();
332  if (Ch=='!'){GetCh(); GetMetaTag();} else {GetTag();}
333  }
334  break;
335  case hlctRTag:
336  if (EscCh){
338  } else {
340  }
341  break;
342  case hlctEof: Sym=hsyEof; break;
343  default: Sym=hsyUndef; GetEscCh();
344  }
345  // set symbol last-character-position
346  SymEChX=ChX-1;
347  // delete last character
348  if (!SymChA.Empty()){SymChA.Pop();}
349  // return symbol
350  return Sym;
351 }
352 
353 PHtmlTok THtmlLx::GetTok(const bool& DoUc){
354  if (DoUc){return PHtmlTok(new THtmlTok(Sym, UcChA, ArgNmValV));}
355  else {return PHtmlTok(new THtmlTok(Sym, ChA, ArgNmValV));}
356 }
357 
359  IAssert(Sym==hsyBTag);
360  TChA BTagChA;
361  BTagChA+=ChA; BTagChA.Pop();
362  for (int ArgN=0; ArgN<GetArgs(); ArgN++){
363  BTagChA+=' '; BTagChA+=GetArgNm(ArgN);
364  BTagChA+='='; BTagChA+='"'; BTagChA+=GetArgVal(ArgN); BTagChA+='"';
365  }
366  BTagChA+='>';
367  return BTagChA;
368 }
369 
370 void THtmlLx::MoveToStrOrEof(const TStr& Str){
371  do {
372  GetSym();
373  } while ((Sym!=hsyEof)&&((Sym!=hsyStr)||(ChA!=Str)));
374 }
375 
376 void THtmlLx::MoveToBTagOrEof(const TStr& TagNm){
377  do {
378  GetSym();
379  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||(UcChA!=TagNm)));
380 }
381 
382 void THtmlLx::MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2){
383  do {
384  GetSym();
385  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2))));
386 }
387 
388 void THtmlLx::MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3){
389  do {
390  GetSym();
391  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)&&(UcChA!=TagNm3))));
392 }
393 
394 void THtmlLx::MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm){
395  do {
396  GetSym();
397  } while ((Sym!=hsyEof) && ((Sym!=hsyBTag)||(UcChA!=BTagNm)) && ((Sym!=hsyETag) || (UcChA!=ETagNm)));
398 }
399 
401  const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal){
402  forever {
403  GetSym();
404  if (Sym==hsyEof){break;}
405  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
406  (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal)){break;}
407  }
408 }
409 
411  const TStr& ArgNm1, const TStr& ArgVal1,
412  const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP){
413  forever {
414  GetSym();
415  if (Sym==hsyEof){break;}
416  if (AndOpP){
417  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
418  (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)&&
419  (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
420  } else {
421  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
422  (((IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1))||
423  ((IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)))){break;}
424  }
425  }
426 }
427 
429  const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
430  const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2){
431  forever {
432  GetSym();
433  if (Sym==hsyEof){break;}
434  if ((Sym==hsyBTag)&&(UcChA==TagNm1)&&
435  (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)){break;}
436  if ((Sym==hsyBTag)&&(UcChA==TagNm2)&&
437  (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
438  }
439 }
440 
441 void THtmlLx::MoveToETagOrEof(const TStr& TagNm){
442  do {
443  GetSym();
444  } while ((Sym!=hsyEof)&&((Sym!=hsyETag)||(UcChA!=TagNm)));
445 }
446 
448  TChA OutChA;
449  forever {
450  GetSym();
451  if (Sym==hsyEof){
452  break;
453  } else {
454  if (PreSpaces>0){OutChA+=' ';}
455  if ((Sym!=hsyBTag)&&(Sym!=hsyETag)){
456  OutChA+=ChA;}
457  }
458  }
459  return OutChA;
460 }
461 
462 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP){
463  TChA OutChA;
464  forever {
465  GetSym();
466  if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm))){
467  break;
468  } else {
469  if (PreSpaces>0){OutChA+=' ';}
470  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
471  OutChA+=ChA;}
472  }
473  }
474  return OutChA;
475 }
476 
477 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
478  const TStr& ArgVal, const bool& TxtOnlyP){
479  TChA OutChA;
480  forever {
481  GetSym();
482  if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm)&&
483  (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal))){
484  break;
485  } else {
486  if (PreSpaces>0){OutChA+=' ';}
487  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
488  OutChA+=ChA;}
489  }
490  }
491  return OutChA;
492 }
493 
494 TStr THtmlLx::GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP){
495  TChA OutChA;
496  forever {
497  GetSym();
498  if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm))){
499  break;
500  } else {
501  if (PreSpaces>0){OutChA+=' ';}
502  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
503  OutChA+=ChA;}
504  }
505  }
506  return OutChA;
507 }
508 
510  const TStr& TagNm2, const bool& TxtOnlyP){
511  TChA OutChA;
512  forever {
513  GetSym();
514  if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm1))||((Sym==hsyETag)&&(UcChA==TagNm2))){
515  break;
516  } else {
517  if (PreSpaces>0){OutChA+=' ';}
518  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
519  OutChA+=ChA;}
520  }
521  }
522  return OutChA;
523 }
524 
525 TStr THtmlLx::GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP){
526  MoveToBTagOrEof(TagNm);
527  return GetStrToETag(TagNm, TxtOnlyP);
528 }
529 
531  TStr HRefStr;
532  forever {
533  GetSym();
534  if (Sym==hsyEof){HRefStr=""; break;}
535  if ((Sym==hsyBTag)&&(UcChA=="<A>")){HRefStr=GetArg("HREF");}
536  if ((Sym==hsyStr)&&(ChA==Str)){break;}
537  }
538  return HRefStr;
539 }
540 
541 bool THtmlLx::IsGetBTag(const TStr& TagNm){
542  if (GetSym()==hsyBTag){
543  return ChA==TagNm;
544  } else {return false;}
545 }
546 
547 bool THtmlLx::IsGetETag(const TStr& TagNm){
548  if (GetSym()==hsyETag){
549  return ChA==TagNm;
550  } else {return false;}
551 }
552 
554  switch (Sym){
555  case hsyUndef: return "Undef";
556  case hsyStr: return "Str";
557  case hsyNum: return "Num";
558  case hsySSym: return "SSym";
559  case hsyUrl: return "Url";
560  case hsyBTag: return "BTag";
561  case hsyETag: return "ETag";
562  case hsyMTag: return "MTag";
563  case hsyEof: return "Eof";
564  default: Fail; return TStr();
565  }
566 }
567 
569  TChA EscapedChA;
570  for (int ChN=0; ChN<ChA.Len(); ChN++){
571  char Ch=ChA[ChN];
572  switch (Ch){
573  case '"': EscapedChA+="&quot;"; break;
574  case '&': EscapedChA+="&amp;"; break;
575  case '\'': EscapedChA+="&apos;"; break;
576  case '<': EscapedChA+="&lt;"; break;
577  case '>': EscapedChA+="&gt;"; break;
578  default: EscapedChA+=Ch;
579  }
580  }
581  return EscapedChA;
582 }
583 
584 TStr THtmlLx::GetAsciiStr(const TChA& ChA, const char& GenericCh){
585  TChA AsciiChA;
586  for (int ChN=0; ChN<ChA.Len(); ChN++){
587  char Ch=ChA[ChN];
588  if ((Ch<' ')||('~'<Ch)){
589  Ch=GenericCh;}
590  AsciiChA+=Ch;
591  }
592  return AsciiChA;
593 }
594 
595 void THtmlLx::GetTokStrV(const TStr& Str, TStrV& TokStrV){
596  PSIn SIn=TStrIn::New(Str);
597  THtmlLx Lx(SIn);
598  Lx.GetSym();
599  TokStrV.Clr();
600  while (Lx.Sym!=hsyEof){
601  TokStrV.Add(Lx.ChA);
602  Lx.GetSym();
603  }
604 }
605 
607  PSIn SIn=TStrIn::New(Str);
608  THtmlLx Lx(SIn);
609  Lx.GetSym();
610  TChA ChA;
611  while (Lx.Sym!=hsyEof){
612  switch (Lx.Sym){
613  case hsyUndef:
614  case hsyStr:
615  case hsyNum:
616  case hsySSym:
617  if (Lx.PreSpaces > 0) { ChA += ' '; }
618  ChA += Lx.ChA;
619  default: break;
620  }
621  Lx.GetSym();
622  }
623  return ChA;
624 }
625 
627 // Html-Token
629  if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){
630  TChA FullChA;
631  FullChA+=Str.GetSubStr(0, Str.Len()-2);
632  for (int ArgNmValN=0; ArgNmValN<ArgNmValV.Len(); ArgNmValN++){
633  FullChA+=' '; FullChA+=ArgNmValV[ArgNmValN].Key; FullChA+='=';
634  FullChA+='"'; FullChA+=ArgNmValV[ArgNmValN].Dat; FullChA+='"';
635  }
636  FullChA+='>';
637  return FullChA;
638  } else
639  if (Sym==hsyETag){
640  TChA FullChA;
641  FullChA+='<'; FullChA+='/'; FullChA+=Str.GetSubStr(1, Str.Len()-1);
642  return FullChA;
643  } else {
644  return GetStr();
645  }
646 }
647 
648 bool THtmlTok::IsUrlTok(TStr& RelUrlStr) const {
649  if (GetSym()==hsyBTag){
650  TStr TagNm=GetStr();
651  if ((TagNm==ATagNm)&&(IsArg(HRefArgNm))){
652  RelUrlStr=GetArg(HRefArgNm); return true;}
653  else if ((TagNm==AreaTagNm)&&(IsArg(HRefArgNm))){
654  RelUrlStr=GetArg(HRefArgNm); return true;}
655  else if ((TagNm==FrameTagNm)&&(IsArg(SrcArgNm))){
656  RelUrlStr=GetArg(SrcArgNm); return true;}
657  else if ((TagNm==ImgTagNm)&&(IsArg(SrcArgNm))){
658  RelUrlStr=GetArg(SrcArgNm); return true;}
659  else if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
660  TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
661  if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
662  TStr ContentStr=GetArg("CONTENT");
663  TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
664  ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
665  RelUrlStr=ContentStr.GetSubStr(
666  LeftStr.Len()+UrlEqStr.Len(), ContentStr.Len());
667  return !RelUrlStr.Empty();
668  } else {
669  return false;
670  }
671  }
672  }
673  return false;
674 }
675 
677  if (GetSym()==hsyBTag){
678  TStr TagNm=GetStr();
679  if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
680  TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
681  if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
682  return true;
683  } else {
684  return false;
685  }
686  }
687  }
688  return false;
689 }
690 
691 void THtmlTok::SaveTxt(const PSOut& SOut, const bool& TxtMode){
692  if (TxtMode){
693  SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
694  } else {
695  SOut->PutStr(THtmlLx::GetSymStr(Sym)); SOut->PutStr(" ");
696  SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
697  }
698 }
699 
700 const TStr THtmlTok::ATagNm="<A>";
701 const TStr THtmlTok::AreaTagNm="<AREA>";
702 const TStr THtmlTok::BrTagNm="<BR>";
703 const TStr THtmlTok::CardTagNm="<CARD>";
704 const TStr THtmlTok::CenterTagNm="<CENTER>";
705 const TStr THtmlTok::FrameTagNm="<FRAME>";
706 const TStr THtmlTok::H1TagNm="<H1>";
707 const TStr THtmlTok::H2TagNm="<H2>";
708 const TStr THtmlTok::H3TagNm="<H3>";
709 const TStr THtmlTok::H4TagNm="<H4>";
710 const TStr THtmlTok::H5TagNm="<H5>";
711 const TStr THtmlTok::H6TagNm="<H6>";
712 const TStr THtmlTok::ImgTagNm="<IMG>";
713 const TStr THtmlTok::LiTagNm="<LI>";
714 const TStr THtmlTok::MetaTagNm="<META>";
715 const TStr THtmlTok::PTagNm="<P>";
716 const TStr THtmlTok::UlTagNm="<UL>";
717 const TStr THtmlTok::TitleTagNm="<TITLE>";
718 const TStr THtmlTok::TitleETagNm="</TITLE>";
719 
720 const TStr THtmlTok::AltArgNm="ALT";
721 const TStr THtmlTok::HRefArgNm="HREF";
722 const TStr THtmlTok::SrcArgNm="SRC";
723 const TStr THtmlTok::TitleArgNm="TITLE";
724 const TStr THtmlTok::HttpEquivArgNm="HTTP-EQUIV";
725 
726 bool THtmlTok::IsBreakTag(const TStr& TagNm){
727  static TStrH BreakTagNmH(50);
728  if (BreakTagNmH.Len()==0){
729  BreakTagNmH.AddKey(TStr("<H1>")); BreakTagNmH.AddKey(TStr("<H2>"));
730  BreakTagNmH.AddKey(TStr("<H3>")); BreakTagNmH.AddKey(TStr("<H4>"));
731  BreakTagNmH.AddKey(TStr("<H5>")); BreakTagNmH.AddKey(TStr("<H6>"));
732  BreakTagNmH.AddKey(TStr("<BR>")); BreakTagNmH.AddKey(TStr("<HR>"));
733  BreakTagNmH.AddKey(TStr("<P>")); BreakTagNmH.AddKey(TStr("<DL>"));
734  BreakTagNmH.AddKey(TStr("<UL>")); BreakTagNmH.AddKey(TStr("<OL>"));
735  BreakTagNmH.AddKey(TStr("<LI>")); BreakTagNmH.AddKey(TStr("<DT>"));
736  BreakTagNmH.AddKey(TStr("<DD>")); BreakTagNmH.AddKey(TStr("<HEAD>"));
737  BreakTagNmH.AddKey(TStr("<TITLE>")); BreakTagNmH.AddKey(TStr("<META>"));
738  BreakTagNmH.AddKey(TStr("<SCRIPT>"));
739  BreakTagNmH.AddKey(TStr("<HEAD>")); BreakTagNmH.AddKey(TStr("<BODY>"));
740  }
741  return BreakTagNmH.IsKey(TagNm);
742 }
743 
745  if ((Tok->GetSym()==hsyBTag)||(Tok->GetSym()==hsyETag)){
746  return IsBreakTag(Tok->GetStr());
747  } else {
748  return false;
749  }
750 }
751 
752 bool THtmlTok::IsHTag(const TStr& TagNm, int& HTagN){
753  if ((TagNm.Len()==4)&&(TagNm[0]=='<')&&(TagNm[1]=='H')&&(TagNm[3]=='>')){
754  char Ch=TagNm[2];
755  if (('1'<=Ch)&&(Ch<='6')){HTagN=Ch-'0'; return true;}
756  else {HTagN=-1; return false;}
757  } else {
758  HTagN=-1; return false;
759  }
760 }
761 
762 PHtmlTok THtmlTok::GetHTok(const bool& IsBTag, const int& HTagN){
763  THtmlLxSym HTagSym=IsBTag?hsyBTag:hsyETag;
764  TStr HTagNm;
765  switch (HTagN){
766  case 1: HTagNm=H1TagNm; break;
767  case 2: HTagNm=H2TagNm; break;
768  case 3: HTagNm=H3TagNm; break;
769  case 4: HTagNm=H4TagNm; break;
770  case 5: HTagNm=H5TagNm; break;
771  case 6: HTagNm=H6TagNm; break;
772  default: Fail;
773  }
774  return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
775 }
776 
778 // Html-Document
779 THtmlDoc::THtmlDoc(const PSIn& SIn, const THtmlDocType& Type, const bool& DoUc):
780  TokV(1000, 0){
781  THtmlLx Lx(SIn);
782  bool MkTok=false; bool InUL=false;
783  while (Lx.GetSym()!=hsyEof){
784  switch (Type){
785  case hdtAll: MkTok=true; break;
786  case hdtStr: MkTok=(Lx.Sym==hsyStr); break;
787  case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break;
788  case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break;
789  case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break;
790  case hdtHRef:
791  MkTok=(Lx.Sym==hsyBTag)&&
792  ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
793  (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
794  (Lx.UcChA==THtmlTok::MetaTagNm));
795  break;
796  case hdtUL:
797  if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
798  MkTok=InUL;
799  if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
800  break;
801  default: Fail;
802  }
803  if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
804  }
806 }
807 
809  TChA LnDocChA;
810  // prepare html parsing
811  PSIn HtmlSIn=TStrIn::New(HtmlStr);
812  THtmlLx HtmlLx(HtmlSIn);
813  bool InScript=false;
814  // save text
815  while (HtmlLx.GetSym()!=hsyEof){
816  TStr Str=HtmlLx.ChA;
817  switch (HtmlLx.Sym){
818  case hsyStr:
819  case hsyNum:
820  case hsySSym:
821  if (InScript){break;}
822  if (HtmlLx.PreSpaces>0){LnDocChA+=' ';}
823  LnDocChA+=Str.CStr();
824  break;
825  case hsyBTag:
826  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
827  if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;}
828  break;
829  case hsyETag:
830  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
831  if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;}
832  break;
833  default: break;
834  }
835  }
836  // return result
837  return LnDocChA;
838 }
839 
841  const TStr& BaseUrlStr, const bool& OutUrlP, const bool& OutTagsP){
842  // prepare output-string
843  TChA OutChA; OutChA+=' ';
844  // prepare html parsing
845  PSIn HtmlSIn=TStrIn::New(HtmlStr);
846  THtmlLx HtmlLx(HtmlSIn);
847  bool InScript=false;
848  // save text
849  while (HtmlLx.GetSym()!=hsyEof){
850  TStr Str=HtmlLx.ChA;
851  switch (HtmlLx.Sym){
852  case hsyUndef:
853  case hsyUrl:
854  case hsyMTag:
855  break;
856  case hsyStr:
857  case hsyNum:
858  case hsySSym:
859  if (InScript){break;}
860  if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}}
861  OutChA+=Str;
862  break;
863  case hsyBTag:
864  // extract tag name
865  Str=Str.GetSubStr(1, Str.Len()-2);
866  // process tag
867  if (!InScript){
868  // check script tag
869  if (Str=="SCRIPT"){
870  InScript=true; break;}
871  // output tag
872  if (OutTagsP){
873  OutChA+='<'; OutChA+=Str; OutChA+='>';
874  } else {
875  if (OutChA.LastCh()!=' '){OutChA+=' ';}
876  }
877  // check if URL present
878  PHtmlTok Tok=HtmlLx.GetTok();
879  TStr RelUrlStr;
880  if (Tok->IsUrlTok(RelUrlStr)){
881  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
882  if (Url->IsOk()){
883  if (OutUrlP){
884  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
885  OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
886  }
887  }
888  }
889  }
890  break;
891  case hsyETag:
892  // extract tag name
893  Str=Str.GetSubStr(1, Str.Len()-2);
894  // process tag
895  if (InScript){
896  if (Str=="SCRIPT"){
897  InScript=false; break;}
898  } else {
899  if (OutTagsP){
900  OutChA+="</"; OutChA+=Str; OutChA+='>';
901  } else {
902  if (OutChA.LastCh()!=' '){OutChA+=' ';}
903  }
904  }
905  break;
906  case hsyEof: break;
907  default: Fail;
908  }
909  }
910  // return string
911  return OutChA;
912 }
913 
914 
915 void THtmlDoc::SaveTxt(const PSOut& SOut, const bool& TxtMode) const {
916  if (TxtMode){
917  for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);}
918  SOut->PutLn();
919  } else {
920  for (int TokN=0; TokN<TokV.Len(); TokN++){
921  SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": ");
922  TokV[TokN]->SaveTxt(SOut);
923  SOut->PutLn();
924  }
925  }
926 }
927 
929  const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
930  const bool& OutUrlP, const bool& OutTagsP){
931  // get text-string from html-string
932  TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP);
933  // save text-string
934  TxtStr.SaveTxt(TxtSOut);
935 }
936 
938  const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
939  const bool& OutUrlP, const bool& OutTagsP){
940  // create output file
941  PSOut TxtSOut=TFOut::New(TxtFNm);
942  // save to output file
943  SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
944 }
945 
947  const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
948  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
949  const bool& OutTagsP, const bool& OutArgsP){
950  // prepare output-file-id
951  TFileId fXml=XmlSOut->GetFileId();
952  // create outgoing url
953  TStrV OutUrlStrV;
954  // open top tag
955  fprintf(fXml, "<HtmlDoc>\n");
956  // save url
957  if (!BaseUrlStr.Empty()){
958  TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
959  fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
960  }
961  // prepare html parsing
962  PSIn HtmlSIn=TStrIn::New(HtmlStr);
963  THtmlLx HtmlLx(HtmlSIn);
964  TChA ContTextChA; bool InScript=false;
965  // save text
966  fprintf(fXml, "<Body>\n");
967  while (HtmlLx.GetSym()!=hsyEof){
968  TStr Str=HtmlLx.ChA;
969  switch (HtmlLx.Sym){
970  case hsyUndef:
971  case hsyUrl:
972  case hsyMTag:
973  break;
974  case hsyStr:
975  if (InScript){break;}
977  if (OutToksP){
978  fprintf(fXml, " <Str>%s</Str>\n", Str.CStr());}
979  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
980  break;
981  case hsyNum:
982  if (InScript){break;}
984  if (OutToksP){
985  fprintf(fXml, " <Num>%s</Num>\n", Str.CStr());}
986  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
987  break;
988  case hsySSym:
989  if (InScript){break;}
991  if (OutToksP){
992  fprintf(fXml, " <Sym>%s</Sym>\n", Str.CStr());}
993  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
994  break;
995  case hsyBTag:{
996  // save continuos text
997  if (!ContTextChA.Empty()){
998  if (OutTextP){
999  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1000  ContTextChA.Clr();
1001  }
1002  // extract tag name
1003  Str=Str.GetSubStr(1, Str.Len()-2);
1005  // process tag
1006  if (!InScript){
1007  // check script tag
1008  if (Str=="SCRIPT"){
1009  InScript=true; break;}
1010  // output tag
1011  if (OutTagsP){
1012  if (OutArgsP){
1013  fprintf(fXml, " <BTag Nm=\"%s\">\n", Str.CStr());
1014  for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
1015  TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
1016  TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
1017  fprintf(fXml, " <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
1018  }
1019  fprintf(fXml, " </BTag>\n");
1020  } else {
1021  fprintf(fXml, " <BTag Nm=\"%s\"/>\n", Str.CStr());
1022  }
1023  }
1024  // check if URL present
1025  PHtmlTok Tok=HtmlLx.GetTok();
1026  TStr RelUrlStr;
1027  if (Tok->IsUrlTok(RelUrlStr)){
1028  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1029  if (Url->IsOk()){
1030  OutUrlStrV.Add(Url->GetUrlStr());
1031  if (OutUrlP){
1032  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
1033  fprintf(fXml, " <Url>%s</Url>\n", XmlUrlStr.CStr());
1034  }
1035  }
1036  }
1037  }
1038  break;}
1039  case hsyETag:{
1040  // save continuos text
1041  if (!ContTextChA.Empty()){
1042  if (OutTextP){
1043  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1044  ContTextChA.Clr();
1045  }
1046  // extract tag name
1047  Str=Str.GetSubStr(1, Str.Len()-2);
1049  // process tag
1050  if (InScript){
1051  if (Str=="SCRIPT"){
1052  InScript=false; break;}
1053  } else {
1054  if (OutTagsP){
1055  fprintf(fXml, " <ETag Nm=\"%s\"/>\n", Str.CStr());}
1056  }
1057  break;}
1058  case hsyEof: break;
1059  default: Fail;
1060  }
1061  }
1062  // save continuos text
1063  if (!ContTextChA.Empty()){
1064  if (OutTextP){
1065  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1066  ContTextChA.Clr();
1067  }
1068  fprintf(fXml, "</Body>\n");
1069  // save outgoing urls
1070  fprintf(fXml, "<OutUrls>\n");
1071  for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
1072  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
1073  fprintf(fXml, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
1074  }
1075  fprintf(fXml, "</OutUrls>\n");
1076 
1077  // close top tag
1078  fprintf(fXml, "</HtmlDoc>\n");
1079 }
1080 
1082  const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
1083  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
1084  const bool& OutTagsP, const bool& OutArgsP){
1085  // create output file
1086  PSOut XmlSOut=TFOut::New(XmlFNm);
1087  // save to output file
1088  SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP,
1089  OutToksP, OutTagsP, OutArgsP);
1090 }
1091 
1092 TLxSym THtmlDoc::GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA){
1093  switch (HtmlLxSym){
1094  case hsyUndef: return syUndef;
1095  case hsyStr: return syStr;
1096  case hsyNum: return syFlt;
1097  case hsySSym: return TLxSymStr::GetSSym(ChA);
1098  case hsyUrl: return syStr;
1099  case hsyBTag: return syStr;
1100  case hsyETag: return syStr;
1101  case hsyEof: return syEof;
1102  default: Fail; return syUndef;
1103  }
1104 }
1105 
1107  const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
1108  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
1109  IAssert(Lx.Sym==hsyBTag);
1110  if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){
1111  TStr RelUrlStr=Lx.GetArg(ArgNm);
1112  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1113  if (Url->IsOk(usHttp)){
1114  TStr UrlStr=Url->GetUrlStr();
1115  PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
1116  Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
1117  return true;
1118  } else {
1119  return false;
1120  }
1121  } else {
1122  return false;
1123  }
1124 }
1125 
1127  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
1128  PSIn SIn=TStrIn::New(HtmlStr);
1129  TMOut SOut;
1130  THtmlLx Lx(SIn);
1131  while (Lx.GetSym()!=hsyEof){
1132  SOut.PutStr(Lx.PreSpaceChA);
1133  if ((Lx.Sym==hsyBTag)&&(
1134  (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1135  (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1136  (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1137  (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){
1138  SOut.PutStr(Lx.GetFullBTagStr());
1139  } else {
1140  SOut.PutStr(Lx.SymChA());
1141  }
1142  }
1143  return SOut.GetAsStr();
1144 }
1145 
1147 // Html-Hyper-Link-Document-Vector
1148 THtmlHldV::THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen):
1149  RefHtmlDoc(_RefHtmlDoc), HldV(){
1150  bool IsTitleAct=false; THtmlTokV TitleTokV;
1151  bool IsHAct=false; int ActHTagN=-1;
1152  TVec<THtmlTokV> HTokV(6);
1153  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
1154  for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
1155  Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
1156  if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
1157  // collect tokens before, inside and after <a> ... </a> tags
1158  int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
1159  // inside <A> tags
1160  THtmlTokV ATokV; ATokN=TokN;
1161  forever{
1162  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1163  if (ATokSym!=hsySSym){ATokV.Add(ATok);}
1164  if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
1165  ATokN++;
1166  if (ATokN>=RefHtmlDoc->GetToks()){break;}
1167  }
1168  int ETagATokN=ATokN+1;
1169  // before <A> tags
1170  THtmlTokV PrevATokV; ATokN=TokN;
1171  forever{
1172  ATokN--;
1173  if (ATokN<0){break;}
1174  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1175  if (THtmlTok::IsBreakTok(ATok)){break;}
1176  if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
1177  if (ATokV.Len()>=HldWnLen){break;}
1178  }
1179  // after <A> tags
1180  THtmlTokV NextATokV; ATokN=ETagATokN;
1181  forever{
1182  ATokN++;
1183  if (ATokN>=RefHtmlDoc->GetToks()){break;}
1184  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1185  if (THtmlTok::IsBreakTok(ATok)){break;}
1186  if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
1187  if (ATokV.Len()>=HldWnLen){break;}
1188  }
1189  // construct html-document with hyper-link context
1190  PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
1191  HtmlDoc->AddTokV(TitleTokV);
1192  for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
1193  HtmlDoc->AddTokV(PrevATokV);
1194  HtmlDoc->AddTokV(ATokV);
1195  HtmlDoc->AddTokV(NextATokV);
1196  HldV.Add(HtmlDoc);
1197  HtmlDoc->SaveTxt(TSOut::StdOut);
1198  } else
1199  if (TokSym==hsyBTag){
1200  int HTagN;
1201  if (TokStr==THtmlTok::TitleTagNm){
1202  IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
1203  } else
1204  if (THtmlTok::IsHTag(TokStr, HTagN)){
1205  if (IsHAct){// conclude previous <H?> tag if left open
1206  HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));}
1207  IsHAct=true; ActHTagN=HTagN;
1208  {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}}
1209  HTokV[ActHTagN-1].Add(Tok);
1210  }
1211  } else
1212  if (TokSym==hsyETag){
1213  int HTagN;
1214  if (TokStr==THtmlTok::TitleTagNm){
1215  if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
1216  } else
1217  if (THtmlTok::IsHTag(TokStr, HTagN)){
1218  if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
1219  }
1220  } else
1221  if (TokSym!=hsySSym){
1222  if (IsTitleAct){TitleTokV.Add(Tok);}
1223  if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
1224  }
1225  }
1226 }
1227 
1229 // Web-Page
1230 void TWebPg::GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const {
1231  // create outgoing url vector
1232  OutUrlV.Clr(); OutRedirUrlV.Clr();
1233  // take interesting web-page components
1234  TStr UrlStr=GetUrlStr();
1235  TStr HtmlStr=GetHttpBodyAsStr();
1236  // prepare html parsing
1237  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1238  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1239  PHtmlTok Tok;
1240  // traverse html
1241  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
1242  PHtmlTok Tok=HtmlDoc->GetTok(TokN);
1243  if (Tok->GetSym()==hsyBTag){
1244  TStr RelUrlStr;
1245  if (Tok->IsUrlTok(RelUrlStr)){
1246  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1247  if (Url->IsOk(usHttp)){
1248  OutUrlV.Add(Url);
1249  if (Tok->IsRedirUrlTok()){
1250  OutRedirUrlV.Add(Url);
1251  }
1252  }
1253  }
1254  }
1255  }
1256 }
1257 
1258 void TWebPg::GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const {
1259  // create outgoing url vector
1260  OutDescUrlStrKdV.Clr();
1261  // take interesting web-page components
1262  TStr UrlStr=GetUrlStr();
1263  TStr HtmlStr=GetHttpBodyAsStr();
1264  // prepare html parsing
1265  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1266  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1267  // traverse html documents
1268  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
1269  int TokN=0; int Toks=HtmlDoc->GetToks();
1270  while (TokN<Toks){
1271  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1272  if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
1273  TStr RelUrlStr;
1274  if (Tok->IsUrlTok(RelUrlStr)){
1275  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1276  if (Url->IsOk()){
1277  TChA DescChA;
1278  while (TokN<Toks){
1279  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1280  if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
1281  break;
1282  } else {
1283  if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
1284  if (!DescChA.Empty()){DescChA+=' ';}
1285  DescChA+=TokStr;
1286  }
1287  }
1288  }
1289  OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
1290  }
1291  }
1292  }
1293  }
1294 }
1295 
1296 void TWebPg::SaveAsHttpBody(const TStr& FNm) const {
1297  // create output file
1298  PSOut SOut=TFOut::New(FNm);
1299  // save http-body
1300  HttpResp->SaveBody(SOut);
1301 }
1302 
1303 void TWebPg::SaveAsHttp(const TStr& FNm) const {
1304  // create output file
1305  PSOut SOut=TFOut::New(FNm);
1306  // save http
1307  HttpResp->SaveTxt(SOut);
1308 }
1309 
// Heuristic test whether the http-response body looks like plain text.
// Inspects at most the first 100 characters of the body and counts those
// that are printable ascii (' '..'~'), tab, line-feed or carriage-return;
// returns true when the computed ratio exceeds 0.9.
// NOTE(review): listing line 1311 -- the opening `if (...){` that pairs with
// the `} else {` below -- is missing from this extract; presumably it tests
// the content type of HttpResp. Restore it from the original html.cpp.
1310 bool TWebPg::IsTxt() const {
1312  TStr Str=HttpResp->GetBodyAsStr();
1313  int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
// after the loop ChN holds the number of inspected characters
1314  while ((ChN<100)&&(ChN<StrLen)){
1315  char Ch=Str[ChN++];
1316  if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
1317  PrintChs++;}
1318  }
// NOTE(review): the denominator is ChN+1 rather than ChN. This keeps the
// division defined for an empty body, but it also lowers the ratio: 9
// printable characters out of 9 inspected give exactly 0.9, which fails the
// strict >0.9 test below. Confirm whether this is intended.
1319  double PrintPrb=double(PrintChs)/double(ChN+1);
1320  return PrintPrb>0.9;
1321  } else {
1322  return false;
1323  }
1324 }
1325 
PHtmlDoc RefHtmlDoc
Definition: html.h:313
Definition: html.h:252
THtmlDocType
Definition: html.h:251
#define IAssert(Cond)
Definition: bd.h:262
static const TStr H5TagNm
Definition: html.h:227
THtmlLxSym
Definition: html.h:78
bool IsGetETag(const TStr &TagNm)
Definition: html.cpp:547
static const TStr H4TagNm
Definition: html.h:226
static TStr GetCSZFromYuascii(const TChA &ChA)
Definition: html.cpp:111
TChA ArgNm
Definition: html.h:93
TStr GetHRefBeforeStr(const TStr &Str)
Definition: html.cpp:530
TStr GetStr() const
Definition: dt.h:1197
static TStr GetWin1250FromYuascii(const TChA &ChA)
Definition: html.cpp:149
static const TStr FrameTagNm
Definition: html.h:222
static THtmlLxChDef ChDef
Definition: html.h:84
THtmlTok()
Definition: html.h:188
static const TStr H3TagNm
Definition: html.h:225
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
bool IsWs(const char &Ch) const
Definition: html.h:36
int Len() const
Definition: dt.h:487
TIntV ChTyV
Definition: html.h:16
static const TStr H1TagNm
Definition: html.h:223
static const TStr TitleArgNm
Definition: html.h:240
Definition: html.h:252
static const TStr LiTagNm
Definition: html.h:230
TChA EscChA
Definition: html.h:92
Definition: html.h:252
Definition: html.h:79
void MoveToETagOrEof(const TStr &TagNm)
Definition: html.cpp:441
static const TStr HRefArgNm
Definition: html.h:238
TStr GetFullBTagStr() const
Definition: html.cpp:358
TStr GetUrlStr(const int &UrlN=-1) const
Definition: html.h:355
void GetOutDescUrlStrKdV(TStrKdV &OutDescUrlStrKdV) const
Definition: html.cpp:1258
Definition: url.h:5
int SymBChX
Definition: html.h:108
void PutStr(const TStr &Str)
Definition: html.h:128
bool IsNum(const char &Ch) const
Definition: html.h:40
#define forever
Definition: bd.h:6
static bool IsBreakTok(const PHtmlTok &Tok)
Definition: html.cpp:744
Definition: html.h:182
THtmlDoc()
Definition: html.h:258
void MoveToBTagArgOrEof(const TStr &TagNm, const TStr &ArgNm, const TStr &ArgVal)
Definition: html.cpp:400
TStr GetEscStr(const TStr &Str) const
Definition: html.cpp:33
bool Empty() const
Definition: dt.h:260
#define Fail
Definition: bd.h:238
static void GetTokStrV(const TStr &Str, TStrV &TokStrV)
Definition: html.cpp:595
TStr GetUc() const
Definition: dt.h:493
void SaveAsHttp(const TStr &FNm) const
Definition: html.cpp:1303
Definition: html.h:79
TLxSym
Definition: lx.h:44
void Clr()
Definition: dt.h:258
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TStr GetFullStr() const
Definition: html.cpp:628
bool IsEoln(const char &Ch) const
Definition: html.h:35
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
void SaveTxt(const PSOut &SOut) const
Definition: http.h:205
void PutCh(const char &_Ch)
Definition: html.h:126
bool DoParseArg
Definition: html.h:87
THtmlLxSym GetSym()
Definition: html.cpp:277
TStr GetArg(const TStr &ArgNm) const
Definition: html.h:207
static const TStr MetaTagNm
Definition: html.h:231
Definition: html.h:12
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
int Len() const
Definition: dt.h:259
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void MoveToBTagArg2OrEof(const TStr &TagNm, const TStr &ArgNm1, const TStr &ArgVal1, const TStr &ArgNm2, const TStr &ArgVal2, const bool &AndOpP=true)
Definition: html.cpp:410
static TStr GetEscapedStr(const TChA &ChA)
Definition: html.cpp:568
bool IsOk(const TUrlScheme _Scheme=usUndef) const
Definition: url.h:32
static bool IsBreakTag(const TStr &TagNm)
Definition: html.cpp:726
TStr GetStr() const
Definition: html.h:203
static PHtmlLxChDef ChDef
Definition: html.h:65
static const TStr TitleTagNm
Definition: html.h:234
char Ch
Definition: html.h:89
int PutLn(const int &Lns=1)
Definition: fl.cpp:158
void GetTag()
Definition: html.cpp:236
bool IsArg(const TStr &ArgNm) const
Definition: html.h:205
bool IsGetBTag(const TStr &TagNm)
Definition: html.cpp:541
TStr GetAsStr() const
Definition: fl.cpp:869
static const TStr AreaTagNm
Definition: html.h:218
TPt< THtmlDoc > PHtmlDoc
Definition: html.h:6
static TLxSym GetLxSym(const THtmlLxSym &HtmlLxSym, const TChA &ChA)
Definition: html.cpp:1092
THtmlLxSym GetSym() const
Definition: html.h:202
Definition: html.h:11
TStr Str
Definition: html.h:185
void SaveAsHttpBody(const TStr &FNm) const
Definition: html.cpp:1296
Definition: html.h:11
TChA PreSpaceChA
Definition: html.h:113
static const TStr HttpEquivArgNm
Definition: html.h:241
TStr GetArgVal(const int &ArgN) const
Definition: html.h:137
void SetChTy(const THtmlLxChTy &ChTy, const TStr &Str)
Definition: html.cpp:24
static const char EofCh
Definition: dt.h:1037
Definition: html.h:252
static const char Mx
Definition: dt.h:1030
Definition: html.h:252
static const TStr H2TagNm
Definition: html.h:224
static const TStr ATagNm
Definition: html.h:217
static const TStr CenterTagNm
Definition: html.h:221
PHttpResp HttpResp
Definition: html.h:334
Definition: lx.h:45
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
THtmlTokV TokV
Definition: html.h:256
THtmlLxChDef()
Definition: html.cpp:48
TPt< THtmlLxChDef > PHtmlLxChDef
Definition: html.h:14
bool IsArg(const TStr &ArgNm) const
Definition: html.h:138
char * CStr()
Definition: dt.h:255
static const TStr UlTagNm
Definition: html.h:233
static TStr GetNoTag(const TStr &Str)
Definition: html.cpp:606
virtual TFileId GetFileId() const
Definition: fl.h:146
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1229
Definition: dt.h:1025
static const TStr CardTagNm
Definition: html.h:220
static const char TabCh
Definition: dt.h:1034
static const char Mn
Definition: dt.h:1029
TStr GetArgNm(const int &ArgN) const
Definition: html.h:136
Definition: lx.h:51
static PSIn New(const TStr &Str)
Definition: dt.h:708
TStr GetStrToETag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:494
void PutArg(const TStr &ArgNm, const TStr &ArgVal)
Definition: html.h:142
char LastCh() const
Definition: dt.h:281
bool EscCh
Definition: html.h:91
Definition: html.h:80
PHtmlTok GetTok(const bool &DoUc=true)
Definition: html.cpp:353
int SymEChX
Definition: html.h:108
Definition: html.h:12
TStr GetArg(const TStr &ArgNm, const TStr &DfArgVal=TStr()) const
Definition: html.h:139
TStr GetBodyAsStr() const
Definition: http.h:170
int GetToks() const
Definition: html.h:270
Definition: html.h:11
static PUrlEnv New()
Definition: url.h:113
bool IsAlNum(const char &Ch) const
Definition: html.h:41
TStr GetStrInTag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:525
static PHtmlTok GetHTok(const bool &IsBTag, const int &HTagN)
Definition: html.cpp:762
void SaveTxt(const PSOut &SOut, const bool &TxtMode=true) const
Definition: html.cpp:915
TStr GetUrlStr() const
Definition: url.h:36
TChA UcChA
Definition: html.h:110
void SetEscStr(const TStr &SrcStr, const TStr &DstStr)
Definition: html.cpp:29
static TStr GetCSZFromWin1250(const TChA &ChA)
Definition: html.cpp:132
static const TStr H6TagNm
Definition: html.h:228
static TStr GetIsoCeFromYuascii(const TChA &ChA)
Definition: html.cpp:170
unsigned char uchar
Definition: bd.h:10
void MoveToBTagOrETagOrEof(const TStr &BTagNm, const TStr &ETagNm)
Definition: html.cpp:394
static const TPt< TSOut > StdOut
Definition: fl.h:208
TChA ChA
Definition: html.h:109
Definition: html.h:79
bool IsRedirUrlTok() const
Definition: html.cpp:676
Definition: lx.h:45
void MoveToBTag3OrEof(const TStr &TagNm1, const TStr &TagNm2, const TStr &TagNm3)
Definition: html.cpp:388
static const char LfCh
Definition: dt.h:1035
int GetChTy(const char &Ch) const
Definition: html.h:34
void GetMetaTag()
Definition: html.cpp:225
static void SaveHtmlToTxt(const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP)
Definition: html.cpp:928
TStrStrH EscStrH
Definition: html.h:19
Definition: dt.h:1134
bool IsTxt() const
Definition: html.cpp:1310
int ChX
Definition: html.h:90
bool IsSpace(const char &Ch) const
Definition: html.h:38
int GetKeyId(const TKey &Key) const
Definition: hash.h:466
Definition: fl.h:495
void AddTokV(const THtmlTokV &_TokV)
Definition: html.h:274
Definition: dt.h:201
TStr GetTextOnlyStrToEof()
Definition: html.cpp:447
static bool IsHTag(const TStr &TagNm, int &HTagN)
Definition: html.cpp:752
Definition: html.h:79
TChV LcChV
Definition: html.h:18
Definition: html.h:80
PSIn SIn
Definition: html.h:85
void GetCh()
Definition: html.h:95
Definition: html.h:12
int AddKey(const TKey &Key)
Definition: hash.h:373
static const TStr ImgTagNm
Definition: html.h:229
static const TStr PTagNm
Definition: html.h:232
TStr GetStrToETag2(const TStr &TagNm1, const TStr &TagNm2, const bool &TxtOnlyP=false)
Definition: html.cpp:509
static TLxSym GetSSym(const TStr &Str)
Definition: lx.cpp:186
void MoveToStrOrEof(const TStr &Str)
Definition: html.cpp:370
void GetOutUrlV(TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const
Definition: html.cpp:1230
static void SaveHtmlToXml(const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP)
Definition: html.cpp:946
bool IsAlpha(const char &Ch) const
Definition: html.h:39
static TStr GetAsciiStr(const TChA &ChA, const char &GenericCh='_')
Definition: html.cpp:584
void GetEscCh()
Definition: html.cpp:195
Definition: html.h:80
static const TStr AltArgNm
Definition: html.h:237
Definition: dt.h:412
TChA SymChA
Definition: html.h:111
THtmlHldV(const PHtmlDoc &_RefHtmlDoc, const int &HldWnLen=10)
Definition: html.cpp:1148
bool Empty() const
Definition: dt.h:488
TPt< THtmlTok > PHtmlTok
Definition: html.h:5
char GetUc(const char &Ch) const
Definition: html.h:52
bool IsUrlTok(TStr &RelUrlStr) const
Definition: html.cpp:648
THtmlLxSym Sym
Definition: html.h:107
Definition: html.h:79
int PutStr(const char *CStr)
Definition: fl.cpp:117
void MoveToBTag2OrEof(const TStr &TagNm1, const TStr &TagNm2)
Definition: html.cpp:382
Definition: html.h:252
static const char CrCh
Definition: dt.h:1036
FILE * TFileId
Definition: bd.h:17
bool IsContType() const
Definition: http.h:192
int GetArgs() const
Definition: html.h:135
Definition: html.h:80
static const TStr TextFldVal
Definition: http.h:25
Definition: html.h:82
Definition: bd.h:196
THtmlLx::TArgNmValV ArgNmValV
Definition: html.h:186
TChV UcChV
Definition: html.h:17
static const TStr BrTagNm
Definition: html.h:219
Definition: html.h:254
THtmlLxChTy
Definition: html.h:10
void SplitOnStr(const TStr &SplitStr, TStrV &StrV) const
Definition: dt.cpp:1008
static TStr GetRedirHtmlDocStr(const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
Definition: html.cpp:1126
TArgNmValV ArgNmValV
Definition: html.h:115
TChA ArgVal
Definition: html.h:94
void SaveBody(const PSOut &SOut) const
Definition: http.h:207
static TStr GetTxtLnDoc(const TStr &HtmlStr)
Definition: html.cpp:808
void SaveTxt(const PSOut &SOut, const bool &TxtMode=true)
Definition: html.cpp:691
Definition: html.h:11
THtmlLxSym Sym
Definition: html.h:184
int PreSpaces
Definition: html.h:112
char * CStr()
Definition: dt.h:476
char Pop()
Definition: dt.h:265
void MoveToBTagOrEof(const TStr &TagNm)
Definition: html.cpp:376
bool IsKey(const TKey &Key) const
Definition: hash.h:258
static PHtmlDoc New(const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
Definition: html.h:261
Definition: lx.h:45
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
int Len() const
Definition: hash.h:228
static const TStr TitleETagNm
Definition: html.h:235
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
PHtmlTok GetTok(const int &TokN) const
Definition: html.h:271
static const TStr SrcArgNm
Definition: html.h:239
static bool _IsTagRedir(const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
Definition: html.cpp:1106
THtmlDocV HldV
Definition: html.h:314
void SetUcCh(const char &UcCh, const char &LcCh)
Definition: html.cpp:3
TKeyDat< TStr, TStr > TStrKd
Definition: ds.h:405
static TStr GetSymStr(const THtmlLxSym &Sym)
Definition: html.cpp:553
TStr GetFullUrlStr() const
Definition: url.cpp:445
void SaveTxt(const PSOut &SOut) const
Definition: dt.h:670
static TStr GetXmlStrFromPlainStr(const TChA &PlainChA)
Definition: xml.cpp:968
TStr GetStrToBTag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:462
TStr GetHttpBodyAsStr() const
Definition: html.h:368