SNAP Library 6.0, Developer Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
html.cpp
Go to the documentation of this file.
1 // Html-Lexical-Chars
3 void THtmlLxChDef::SetUcCh(const char& UcCh, const char& LcCh){
4  // update upper-case (more lower cases may have one upper case)
5  IAssert(
6  (UcChV[LcCh-TCh::Mn]==TCh(0))||
7  (UcChV[LcCh-TCh::Mn]==TCh(LcCh)));
8  UcChV[LcCh-TCh::Mn]=TCh(UcCh);
9  // update lower-case (one upper case may have only one lower case)
10  if ((LcChV[UcCh-TCh::Mn]==TCh(0))||(LcChV[UcCh-TCh::Mn]==TCh(UcCh))){
11  LcChV[UcCh-TCh::Mn]=TCh(LcCh);
12  }
13 }
14 
15 void THtmlLxChDef::SetUcCh(const TStr& Str){
16  // set type of characters as letters
17  SetChTy(hlctAlpha, Str);
18  // first char in string is upper-case, rest are lower-case
19  for (int ChN=1; ChN<Str.Len(); ChN++){
20  SetUcCh(Str[0], Str[ChN]);
21  }
22 }
23 
24 void THtmlLxChDef::SetChTy(const THtmlLxChTy& ChTy, const TStr& Str){
25  for (int ChN=0; ChN<Str.Len(); ChN++){
26  ChTyV[Str[ChN]-TCh::Mn]=TInt(ChTy);}
27 }
28 
29 void THtmlLxChDef::SetEscStr(const TStr& SrcStr, const TStr& DstStr){
30  EscStrH.AddDat(SrcStr, DstStr);
31 }
32 
33 TStr THtmlLxChDef::GetEscStr(const TStr& Str) const {
34  int EscStrId;
35  if ((EscStrId=EscStrH.GetKeyId(Str))!=-1){
36  return EscStrH[EscStrId];
37  } else
38  if ((Str.Len()>=2)&&(Str[0]=='&')&&(Str[1]=='#')){
39  int ChCd=0;
40  for (int ChN=2; ChN<Str.Len(); ChN++){
41  if (ChCd<=0xFFFF){ChCd=ChCd*10+Str[ChN]-'0';}}
42  return TStr((char)ChCd);
43  } else {
44  return TStr(' ');
45  }
46 }
47 
49  ChTyV(TCh::Vals), UcChV(TCh::Vals), LcChV(TCh::Vals), EscStrH(100){
50 
51  // Character-Types
53  SetChTy(hlctAlpha, "ABCDEFGHIJKLMNOPQRSTUVWXYZ");
54  SetChTy(hlctAlpha, "abcdefghijklmnopqrstuvwxyz");
55  SetChTy(hlctAlpha, "@_");
56  SetChTy(hlctNum, "0123456789");
57  SetChTy(hlctSym, "`~!#$%^&*()-=+[{]}\\|;:'\",<.>/?");
58  SetChTy(hlctLTag, "<"); SetChTy(hlctRTag, ">");
60  for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
61  if ((Ch<0)||(127<Ch)){SetChTy(hlctAlpha, TStr(TCh(char(Ch))));}}
62  //SetChTy(hlctSpace, TStr(TCh(char(160))));
63 
64  // Upper-Case
65  {for (int Ch=TCh::Mn; Ch<=TCh::Mx; Ch++){
66  SetUcCh(char(Ch), char(Ch));}}
67  SetUcCh("Aa"); SetUcCh("\xc0\xe0"); SetUcCh("\xc1\xe1"); SetUcCh("\xc2\xe2");
68  SetUcCh("\xc3\xe3"); SetUcCh("\xc4\xe4"); SetUcCh("\xc5\xe5"); SetUcCh("\xc6\xe6");
69  SetUcCh("Bb"); SetUcCh("Cc"); SetUcCh("\xc7\xe7"); SetUcCh("Dd");
70  SetUcCh("\xd0\xf0"); SetUcCh("Ee"); SetUcCh("\xc8\xe8"); SetUcCh("\xc9\xe9");
71  SetUcCh("\xca\xea"); SetUcCh("\xcb\xeb"); SetUcCh("Ff"); SetUcCh("Gg");
72  SetUcCh("Hh"); SetUcCh("Ii"); SetUcCh("\xcc\xec"); SetUcCh("\xcd\xed");
73  SetUcCh("\xce\xee"); SetUcCh("\xcf\xef"); SetUcCh("Jj"); SetUcCh("Kk");
74  SetUcCh("Ll"); SetUcCh("Mm"); SetUcCh("Nn"); SetUcCh("\xd1\xf1");
75  SetUcCh("Oo"); SetUcCh("\xd2\xf2"); SetUcCh("\xd3\xf3"); SetUcCh("\xd4\xf4");
76  SetUcCh("\xd5\xf5"); SetUcCh("\xd6\xf6"); SetUcCh("\xd8\xf8"); SetUcCh("Pp");
77  SetUcCh("Qq"); SetUcCh("Rr"); SetUcCh("Ss"); SetUcCh("\x8a\x9a");
78  SetUcCh("Tt"); SetUcCh("Uu"); SetUcCh("\xd9\xf9"); SetUcCh("\xda\xfa");
79  SetUcCh("\xdb\xfb"); SetUcCh("\xdc\xfc"); SetUcCh("Vv"); SetUcCh("Ww");
80  SetUcCh("Xx"); SetUcCh("Yy\xff"); SetUcCh("\xdd\xfd"); SetUcCh("Zz");
81  SetUcCh("\x8e\x9e");
82  // ISO-CE
83  //SetUcCh(uchar(169), uchar(185)); /*Sh - \xa9\xb9*/
84  //SetUcCh(uchar(174), uchar(190)); /*Zh - \xae\xbe*/
85  //SetUcCh(uchar(200), uchar(232)); /*Ch - \xc8\xe8*/
86  //SetUcCh(uchar(198), uchar(230)); /*Cs - \xc6\xe6*/
87  //SetUcCh(uchar(208), uchar(240)); /*Dz - \xd0\xf0*/
88 
89  // Annoying Unicode-characters
90  //SetChTy(hlctSpace, "\xc2\xef");
91 
92  // Escape-Sequences
93  SetEscStr("&quot", "\""); SetEscStr("&amp", "&");
94  SetEscStr("&lt", "<"); SetEscStr("&gt", ">");
95  SetEscStr("&nbsp", " ");
96 
97  SetEscStr("&auml", "\xe4"); SetEscStr("&Auml", "\xc4");
98  SetEscStr("&ouml", "\xf6"); SetEscStr("&Ouml", "\xd6");
99  SetEscStr("&uuml", "\xfc"); SetEscStr("&Uuml", "\xdc");
100  SetEscStr("&aring", "\xe5"); SetEscStr("&Aring", "\xc5");
101  SetEscStr("&oslash", "\xf8"); SetEscStr("&Oslash", "\xd8");
102  SetEscStr("&Aelig", "\xc6"); SetEscStr("&aelig", "\xe6");
103 
104  SetEscStr("&eacute", "e"); SetEscStr("&Eacute", "E");
105  SetEscStr("&egrave", "e"); SetEscStr("&Egrave", "E");
106  SetEscStr("&agrave", "a"); SetEscStr("&Agrave", "A");
107 }
108 
110 
112  TChA DstChA;
113  for (int ChN=0; ChN<ChA.Len(); ChN++){
114  char Ch=ChA[ChN];
115  switch (Ch){
116  case '~': DstChA+='c'; break;
117  case '^': DstChA+='C'; break;
118  case '}': DstChA+='c'; break;
119  case ']': DstChA+='C'; break;
120  case '|': DstChA+='d'; break;
121  case '\\': DstChA+='D'; break;
122  case '{': DstChA+='s'; break;
123  case '[': DstChA+='S'; break;
124  case '`': DstChA+='z'; break;
125  case '@': DstChA+='Z'; break;
126  default: DstChA+=Ch;
127  }
128  }
129  return DstChA;
130 }
131 
133  TChA DstChA;
134  for (int ChN=0; ChN<ChA.Len(); ChN++){
135  const uchar Ch=ChA[ChN];
136  switch (Ch){
137  case 232: DstChA+='c'; break;
138  case 200: DstChA+='C'; break;
139  case 154: DstChA+='s'; break;
140  case 138: DstChA+='S'; break;
141  case 158: DstChA+='z'; break;
142  case 142: DstChA+='Z'; break;
143  default: DstChA+=Ch;
144  }
145  }
146  return DstChA;
147 }
148 
150  TChA DstChA;
151  for (int ChN=0; ChN<ChA.Len(); ChN++){
152  char Ch=ChA[ChN];
153  switch (Ch){
154  case '~': DstChA+=uchar(232); break;
155  case '^': DstChA+=uchar(200); break;
156  case '}': DstChA+='c'; break;
157  case ']': DstChA+='C'; break;
158  case '|': DstChA+='d'; break;
159  case '\\': DstChA+='D'; break;
160  case '{': DstChA+=uchar(154); break;
161  case '[': DstChA+=uchar(138); break;
162  case '`': DstChA+=uchar(158); break;
163  case '@': DstChA+=uchar(142); break;
164  default: DstChA+=Ch;
165  }
166  }
167  return DstChA;
168 }
169 
171  TChA DstChA;
172  for (int ChN=0; ChN<ChA.Len(); ChN++){
173  char Ch=ChA[ChN];
174  switch (Ch){
175  case '~': DstChA+=uchar(232); break;
176  case '^': DstChA+=uchar(200); break;
177  case '}': DstChA+=uchar(230); break;
178  case ']': DstChA+=uchar(198); break;
179  case '|': DstChA+=uchar(240); break;
180  case '\\': DstChA+=uchar(208); break;
181  case '{': DstChA+=uchar(185); break;
182  case '[': DstChA+=uchar(169); break;
183  case '`': DstChA+=uchar(190); break;
184  case '@': DstChA+=uchar(174); break;
185  default: DstChA+=Ch;
186  }
187  }
188  return DstChA;
189 }
190 
192 // Html-Lexical
194 
196  GetCh();
197  EscCh=(Ch=='&');
198  if (EscCh){
199  EscChA.Clr(); EscChA.AddCh(Ch); GetCh();
200  if (Ch=='#'){
201  EscChA.AddCh(Ch); GetCh();
202  if (('0'<=Ch)&&(Ch<='9')){
203  do {EscChA.AddCh(Ch); GetCh();} while (('0'<=Ch)&&(Ch<='9'));
204  if (Ch==';'){GetCh();}
206  } else {
207  PutCh('#'); PutCh('&');
208  }
209  } else
210  if ((('a'<=Ch)&&(Ch<='z'))||(('A'<=Ch)&&(Ch<='Z'))){
211  do {
212  EscChA.AddCh(Ch); GetCh();
213  } while ((('A'<=Ch)&&(Ch<='Z'))||(('a'<=Ch)&&(Ch<='z'))||(('0'<=Ch)&&(Ch<='9')));
214  if (Ch==';'){
216  } else {
217  PutStr(EscChA);
218  }
219  } else {
220  PutCh('&');
221  }
222  }
223 }
224 
226  Sym=hsyMTag;
227  if (Ch=='-'){
228  char PCh=' ';
229  while ((Ch!=TCh::EofCh) && ((PCh!='-')||(Ch!='>'))){PCh=Ch; GetCh();}
230  } else {
231  while ((Ch!=TCh::EofCh) && (Ch!='>')){GetCh();}
232  }
233  if (Ch!=TCh::EofCh){GetEscCh();}
234 }
235 
237  if (Ch=='/'){Sym=hsyETag; GetCh();} else {Sym=hsyBTag;}
238  UcChA.AddCh('<');
239  while (ChDef.IsAlNum(Ch)||(Ch==':')){
240  UcChA.AddCh(ChDef.GetUc(Ch)); GetCh();}
241  UcChA.AddCh('>');
242  ChA=UcChA;
243 
244  if (DoParseArg){
245  while ((Ch!='>')&&(Ch!=TCh::EofCh)){
246  while ((!ChDef.IsAlpha(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
247  if (ChDef.IsAlpha(Ch)){
248  ArgNm.Clr(); ArgVal.Clr();
249  while (ChDef.IsAlNum(Ch)||(Ch=='-')){ArgNm.AddCh(ChDef.GetUc(Ch)); GetCh();}
250  while (ChDef.IsWs(Ch)){GetCh();}
251  if (Ch=='='){
252  GetCh(); while (ChDef.IsWs(Ch)){GetCh();}
253  if (Ch=='"'){
254  GetCh();
255  while ((Ch!=TCh::EofCh)&&(Ch!='"')&&(Ch!='>')){
256  if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
257  if (Ch=='"'){GetCh();}
258  } else if (Ch=='\''){
259  GetCh();
260  while ((Ch!=TCh::EofCh)&&(Ch!='\'')&&(Ch!='>')){
261  if (!ChDef.IsEoln(Ch)){ArgVal.AddCh(Ch);} GetCh();}
262  if (Ch=='\''){GetCh();}
263  } else {
264  while ((!ChDef.IsWs(Ch))&&(Ch!='>')&&(Ch!=TCh::EofCh)){
265  ArgVal.AddCh(Ch); GetCh();}
266  }
268  }
269  }
270  }
271  } else {
272  while ((Ch!='>')&&(Ch!=TCh::EofCh)){GetCh();}
273  }
274  if (Ch!=TCh::EofCh){GetEscCh();}
275 }
276 
278  // prepare symbol descriptions
279  ChA.Clr(); UcChA.Clr();
281  ArgNmValV.Clr();
282  // skip white-space
283  while (ChDef.IsSpace(Ch)){
284  if (ChX>0){PreSpaceChA+=Ch; PreSpaces++;} GetEscCh();}
285  // parse symbol
286  SymChA.Clr(); SymChA+=Ch; SymBChX=ChX;
287  switch (ChDef.GetChTy(Ch)){
288  case hlctAlpha:
289  Sym=hsyStr;
290  forever{
291  do {
293  } while (ChDef.IsAlNum(Ch));
294  if (Ch=='.'){
295  GetCh();
296  if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
297  else {PutCh(Ch); Ch='.'; break;}
298  } else {break;}
299  }
300  break;
301  case hlctNum:
302  Sym=hsyNum;
303  forever{
304  do {
305  ChA.AddCh(Ch); UcChA.AddCh(Ch); GetEscCh();
306  } while (ChDef.IsNum(Ch));
307  if (Ch=='.'){
308  GetCh();
309  if (ChDef.IsAlNum(Ch)){ChA.AddCh('.'); UcChA.AddCh('.');}
310  else {PutCh(Ch); Ch='.'; break;}
311  } else if (ChDef.IsAlpha(Ch)){
312  Sym=hsyStr;
313  } else {
314  break;
315  }
316  }
317  break;
318  case hlctSym:
320  if ((ChA.LastCh()=='.')&&(ChDef.IsAlNum(Ch))){
321  Sym=hsyStr;
322  do {
324  } while (ChDef.IsAlNum(Ch));
325  }
326  break;
327  case hlctLTag:
328  if (EscCh){
330  } else {
331  GetCh();
332  if (Ch=='!'){GetCh(); GetMetaTag();} else {GetTag();}
333  }
334  break;
335  case hlctRTag:
336  if (EscCh){
338  } else {
340  }
341  break;
342  case hlctEof: Sym=hsyEof; break;
343  default: Sym=hsyUndef; GetEscCh();
344  }
345  // set symbol last-character-position
346  SymEChX=ChX-1;
347  // delete last character
348  if (!SymChA.Empty()){SymChA.Pop();}
349  // return symbol
350  return Sym;
351 }
352 
353 PHtmlTok THtmlLx::GetTok(const bool& DoUc){
354  if (DoUc){return PHtmlTok(new THtmlTok(Sym, UcChA, ArgNmValV));}
355  else {return PHtmlTok(new THtmlTok(Sym, ChA, ArgNmValV));}
356 }
357 
359  IAssert(Sym==hsyBTag);
360  TChA BTagChA;
361  BTagChA+=ChA; BTagChA.Pop();
362  for (int ArgN=0; ArgN<GetArgs(); ArgN++){
363  BTagChA+=' '; BTagChA+=GetArgNm(ArgN);
364  BTagChA+='='; BTagChA+='"'; BTagChA+=GetArgVal(ArgN); BTagChA+='"';
365  }
366  BTagChA+='>';
367  return BTagChA;
368 }
369 
370 void THtmlLx::MoveToStrOrEof(const TStr& Str){
371  do {
372  GetSym();
373  } while ((Sym!=hsyEof)&&((Sym!=hsyStr)||(ChA!=Str)));
374 }
375 
376 void THtmlLx::MoveToBTagOrEof(const TStr& TagNm){
377  do {
378  GetSym();
379  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||(UcChA!=TagNm)));
380 }
381 
382 void THtmlLx::MoveToBTag2OrEof(const TStr& TagNm1, const TStr& TagNm2){
383  do {
384  GetSym();
385  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2))));
386 }
387 
388 void THtmlLx::MoveToBTag3OrEof(const TStr& TagNm1, const TStr& TagNm2, const TStr& TagNm3){
389  do {
390  GetSym();
391  } while ((Sym!=hsyEof)&&((Sym!=hsyBTag)||((UcChA!=TagNm1)&&(UcChA!=TagNm2)&&(UcChA!=TagNm3))));
392 }
393 
394 void THtmlLx::MoveToBTagOrETagOrEof(const TStr& BTagNm, const TStr& ETagNm){
395  do {
396  GetSym();
397  } while ((Sym!=hsyEof) && ((Sym!=hsyBTag)||(UcChA!=BTagNm)) && ((Sym!=hsyETag) || (UcChA!=ETagNm)));
398 }
399 
401  const TStr& TagNm, const TStr& ArgNm, const TStr& ArgVal){
402  forever {
403  GetSym();
404  if (Sym==hsyEof){break;}
405  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
406  (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal)){break;}
407  }
408 }
409 
411  const TStr& ArgNm1, const TStr& ArgVal1,
412  const TStr& ArgNm2, const TStr& ArgVal2, const bool& AndOpP){
413  forever {
414  GetSym();
415  if (Sym==hsyEof){break;}
416  if (AndOpP){
417  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
418  (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)&&
419  (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
420  } else {
421  if ((Sym==hsyBTag)&&(UcChA==TagNm)&&
422  (((IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1))||
423  ((IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)))){break;}
424  }
425  }
426 }
427 
429  const TStr& TagNm1, const TStr& ArgNm1, const TStr& ArgVal1,
430  const TStr& TagNm2, const TStr& ArgNm2, const TStr& ArgVal2){
431  forever {
432  GetSym();
433  if (Sym==hsyEof){break;}
434  if ((Sym==hsyBTag)&&(UcChA==TagNm1)&&
435  (IsArg(ArgNm1))&&(GetArg(ArgNm1)==ArgVal1)){break;}
436  if ((Sym==hsyBTag)&&(UcChA==TagNm2)&&
437  (IsArg(ArgNm2))&&(GetArg(ArgNm2)==ArgVal2)){break;}
438  }
439 }
440 
441 void THtmlLx::MoveToETagOrEof(const TStr& TagNm){
442  do {
443  GetSym();
444  } while ((Sym!=hsyEof)&&((Sym!=hsyETag)||(UcChA!=TagNm)));
445 }
446 
448  TChA OutChA;
449  forever {
450  GetSym();
451  if (Sym==hsyEof){
452  break;
453  } else {
454  if (PreSpaces>0){OutChA+=' ';}
455  if ((Sym!=hsyBTag)&&(Sym!=hsyETag)){
456  OutChA+=ChA;}
457  }
458  }
459  return OutChA;
460 }
461 
462 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const bool& TxtOnlyP){
463  TChA OutChA;
464  forever {
465  GetSym();
466  if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm))){
467  break;
468  } else {
469  if (PreSpaces>0){OutChA+=' ';}
470  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
471  OutChA+=ChA;}
472  }
473  }
474  return OutChA;
475 }
476 
477 TStr THtmlLx::GetStrToBTag(const TStr& TagNm, const TStr& ArgNm,
478  const TStr& ArgVal, const bool& TxtOnlyP){
479  TChA OutChA;
480  forever {
481  GetSym();
482  if ((Sym==hsyEof)||((Sym==hsyBTag)&&(UcChA==TagNm)&&
483  (IsArg(ArgNm))&&(GetArg(ArgNm)==ArgVal))){
484  break;
485  } else {
486  if (PreSpaces>0){OutChA+=' ';}
487  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
488  OutChA+=ChA;}
489  }
490  }
491  return OutChA;
492 }
493 
494 TStr THtmlLx::GetStrToETag(const TStr& TagNm, const bool& TxtOnlyP){
495  TChA OutChA;
496  forever {
497  GetSym();
498  if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm))){
499  break;
500  } else {
501  if (PreSpaces>0){OutChA+=' ';}
502  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
503  OutChA+=ChA;}
504  }
505  }
506  return OutChA;
507 }
508 
510  const TStr& TagNm2, const bool& TxtOnlyP){
511  TChA OutChA;
512  forever {
513  GetSym();
514  if ((Sym==hsyEof)||((Sym==hsyETag)&&(UcChA==TagNm1))||((Sym==hsyETag)&&(UcChA==TagNm2))){
515  break;
516  } else {
517  if (PreSpaces>0){OutChA+=' ';}
518  if ((TxtOnlyP&&(Sym!=hsyBTag)&&(Sym!=hsyETag))||(!TxtOnlyP)){
519  OutChA+=ChA;}
520  }
521  }
522  return OutChA;
523 }
524 
525 TStr THtmlLx::GetStrInTag(const TStr& TagNm, const bool& TxtOnlyP){
526  MoveToBTagOrEof(TagNm);
527  return GetStrToETag(TagNm, TxtOnlyP);
528 }
529 
531  TStr HRefStr;
532  forever {
533  GetSym();
534  if (Sym==hsyEof){HRefStr=""; break;}
535  if ((Sym==hsyBTag)&&(UcChA=="<A>")){HRefStr=GetArg("HREF");}
536  if ((Sym==hsyStr)&&(ChA==Str)){break;}
537  }
538  return HRefStr;
539 }
540 
541 bool THtmlLx::IsGetBTag(const TStr& TagNm){
542  if (GetSym()==hsyBTag){
543  return ChA==TagNm;
544  } else {return false;}
545 }
546 
547 bool THtmlLx::IsGetETag(const TStr& TagNm){
548  if (GetSym()==hsyETag){
549  return ChA==TagNm;
550  } else {return false;}
551 }
552 
554  switch (Sym){
555  case hsyUndef: return "Undef";
556  case hsyStr: return "Str";
557  case hsyNum: return "Num";
558  case hsySSym: return "SSym";
559  case hsyUrl: return "Url";
560  case hsyBTag: return "BTag";
561  case hsyETag: return "ETag";
562  case hsyMTag: return "MTag";
563  case hsyEof: return "Eof";
564  default: Fail; return TStr();
565  }
566 }
567 
569  TChA EscapedChA;
570  for (int ChN=0; ChN<ChA.Len(); ChN++){
571  char Ch=ChA[ChN];
572  switch (Ch){
573  case '"': EscapedChA+="&quot;"; break;
574  case '&': EscapedChA+="&amp;"; break;
575  case '\'': EscapedChA+="&apos;"; break;
576  case '<': EscapedChA+="&lt;"; break;
577  case '>': EscapedChA+="&gt;"; break;
578  default: EscapedChA+=Ch;
579  }
580  }
581  return EscapedChA;
582 }
583 
584 TStr THtmlLx::GetAsciiStr(const TChA& ChA, const char& GenericCh){
585  TChA AsciiChA;
586  for (int ChN=0; ChN<ChA.Len(); ChN++){
587  char Ch=ChA[ChN];
588  if ((Ch<' ')||('~'<Ch)){
589  Ch=GenericCh;}
590  AsciiChA+=Ch;
591  }
592  return AsciiChA;
593 }
594 
595 void THtmlLx::GetTokStrV(const TStr& Str, TStrV& TokStrV){
596  PSIn SIn=TStrIn::New(Str);
597  THtmlLx Lx(SIn);
598  Lx.GetSym();
599  TokStrV.Clr();
600  while (Lx.Sym!=hsyEof){
601  TokStrV.Add(Lx.ChA);
602  Lx.GetSym();
603  }
604 }
605 
607  PSIn SIn=TStrIn::New(Str);
608  THtmlLx Lx(SIn);
609  Lx.GetSym();
610  TChA ChA;
611  while (Lx.Sym!=hsyEof){
612  switch (Lx.Sym){
613  case hsyUndef:
614  case hsyStr:
615  case hsyNum:
616  case hsySSym:
617  if (Lx.PreSpaces > 0) { ChA += ' '; }
618  ChA += Lx.ChA;
619  default: break;
620  }
621  Lx.GetSym();
622  }
623  return ChA;
624 }
625 
627 // Html-Token
629  if ((Sym==hsyBTag)&&(ArgNmValV.Len()>0)){
630  TChA FullChA;
631  FullChA+=Str.GetSubStr(0, Str.Len()-2);
632  for (int ArgNmValN=0; ArgNmValN<ArgNmValV.Len(); ArgNmValN++){
633  FullChA+=' '; FullChA+=ArgNmValV[ArgNmValN].Key; FullChA+='=';
634  FullChA+='"'; FullChA+=ArgNmValV[ArgNmValN].Dat; FullChA+='"';
635  }
636  FullChA+='>';
637  return FullChA;
638  } else
639  if (Sym==hsyETag){
640  TChA FullChA;
641  FullChA+='<'; FullChA+='/'; FullChA+=Str.GetSubStr(1, Str.Len()-1);
642  return FullChA;
643  } else {
644  return GetStr();
645  }
646 }
647 
648 bool THtmlTok::IsUrlTok(TStr& RelUrlStr) const {
649  if (GetSym()==hsyBTag){
650  TStr TagNm=GetStr();
651  if ((TagNm==ATagNm)&&(IsArg(HRefArgNm))){
652  RelUrlStr=GetArg(HRefArgNm); return true;}
653  else if ((TagNm==AreaTagNm)&&(IsArg(HRefArgNm))){
654  RelUrlStr=GetArg(HRefArgNm); return true;}
655  else if ((TagNm==FrameTagNm)&&(IsArg(SrcArgNm))){
656  RelUrlStr=GetArg(SrcArgNm); return true;}
657  else if ((TagNm==ImgTagNm)&&(IsArg(SrcArgNm))){
658  RelUrlStr=GetArg(SrcArgNm); return true;}
659  else if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
660  TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
661  if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
662  TStr ContentStr=GetArg("CONTENT");
663  TStr LeftStr; TStr RightStr; TStr UrlEqStr="URL=";
664  ContentStr.GetUc().SplitOnStr(LeftStr, UrlEqStr, RightStr);
665  RelUrlStr=ContentStr.GetSubStr(
666  LeftStr.Len()+UrlEqStr.Len(), ContentStr.Len());
667  return !RelUrlStr.Empty();
668  } else {
669  return false;
670  }
671  }
672  }
673  return false;
674 }
675 
677  if (GetSym()==hsyBTag){
678  TStr TagNm=GetStr();
679  if ((TagNm==MetaTagNm)&&(IsArg(HttpEquivArgNm))){
680  TStr HttpEquivArgVal=GetArg(HttpEquivArgNm).GetUc();
681  if ((HttpEquivArgVal=="REFRESH")&&IsArg("CONTENT")){
682  return true;
683  } else {
684  return false;
685  }
686  }
687  }
688  return false;
689 }
690 
691 void THtmlTok::SaveTxt(const PSOut& SOut, const bool& TxtMode){
692  if (TxtMode){
693  SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
694  } else {
695  SOut->PutStr(THtmlLx::GetSymStr(Sym)); SOut->PutStr(" ");
696  SOut->PutStr(GetFullStr()); SOut->PutStr(" ");
697  }
698 }
699 
700 const TStr THtmlTok::ATagNm="<A>";
701 const TStr THtmlTok::AreaTagNm="<AREA>";
702 const TStr THtmlTok::BrTagNm="<BR>";
703 const TStr THtmlTok::CardTagNm="<CARD>";
704 const TStr THtmlTok::CenterTagNm="<CENTER>";
705 const TStr THtmlTok::FrameTagNm="<FRAME>";
706 const TStr THtmlTok::H1TagNm="<H1>";
707 const TStr THtmlTok::H2TagNm="<H2>";
708 const TStr THtmlTok::H3TagNm="<H3>";
709 const TStr THtmlTok::H4TagNm="<H4>";
710 const TStr THtmlTok::H5TagNm="<H5>";
711 const TStr THtmlTok::H6TagNm="<H6>";
712 const TStr THtmlTok::ImgTagNm="<IMG>";
713 const TStr THtmlTok::LiTagNm="<LI>";
714 const TStr THtmlTok::MetaTagNm="<META>";
715 const TStr THtmlTok::PTagNm="<P>";
716 const TStr THtmlTok::UlTagNm="<UL>";
717 const TStr THtmlTok::TitleTagNm="<TITLE>";
718 const TStr THtmlTok::TitleETagNm="</TITLE>";
719 
720 const TStr THtmlTok::AltArgNm="ALT";
721 const TStr THtmlTok::HRefArgNm="HREF";
722 const TStr THtmlTok::SrcArgNm="SRC";
723 const TStr THtmlTok::TitleArgNm="TITLE";
724 const TStr THtmlTok::HttpEquivArgNm="HTTP-EQUIV";
725 
726 bool THtmlTok::IsBreakTag(const TStr& TagNm){
727  static TStrH BreakTagNmH(50);
728  if (BreakTagNmH.Len()==0){
729  BreakTagNmH.AddKey(TStr("<H1>")); BreakTagNmH.AddKey(TStr("<H2>"));
730  BreakTagNmH.AddKey(TStr("<H3>")); BreakTagNmH.AddKey(TStr("<H4>"));
731  BreakTagNmH.AddKey(TStr("<H5>")); BreakTagNmH.AddKey(TStr("<H6>"));
732  BreakTagNmH.AddKey(TStr("<BR>")); BreakTagNmH.AddKey(TStr("<HR>"));
733  BreakTagNmH.AddKey(TStr("<P>")); BreakTagNmH.AddKey(TStr("<DL>"));
734  BreakTagNmH.AddKey(TStr("<UL>")); BreakTagNmH.AddKey(TStr("<OL>"));
735  BreakTagNmH.AddKey(TStr("<LI>")); BreakTagNmH.AddKey(TStr("<DT>"));
736  BreakTagNmH.AddKey(TStr("<DD>")); BreakTagNmH.AddKey(TStr("<HEAD>"));
737  BreakTagNmH.AddKey(TStr("<TITLE>")); BreakTagNmH.AddKey(TStr("<META>"));
738  BreakTagNmH.AddKey(TStr("<SCRIPT>"));
739  BreakTagNmH.AddKey(TStr("<HEAD>")); BreakTagNmH.AddKey(TStr("<BODY>"));
740  }
741  return BreakTagNmH.IsKey(TagNm);
742 }
743 
745  if ((Tok->GetSym()==hsyBTag)||(Tok->GetSym()==hsyETag)){
746  return IsBreakTag(Tok->GetStr());
747  } else {
748  return false;
749  }
750 }
751 
752 bool THtmlTok::IsHTag(const TStr& TagNm, int& HTagN){
753  if ((TagNm.Len()==4)&&(TagNm[0]=='<')&&(TagNm[1]=='H')&&(TagNm[3]=='>')){
754  char Ch=TagNm[2];
755  if (('1'<=Ch)&&(Ch<='6')){HTagN=Ch-'0'; return true;}
756  else {HTagN=-1; return false;}
757  } else {
758  HTagN=-1; return false;
759  }
760 }
761 
762 PHtmlTok THtmlTok::GetHTok(const bool& IsBTag, const int& HTagN){
763  THtmlLxSym HTagSym=IsBTag?hsyBTag:hsyETag;
764  TStr HTagNm;
765  switch (HTagN){
766  case 1: HTagNm=H1TagNm; break;
767  case 2: HTagNm=H2TagNm; break;
768  case 3: HTagNm=H3TagNm; break;
769  case 4: HTagNm=H4TagNm; break;
770  case 5: HTagNm=H5TagNm; break;
771  case 6: HTagNm=H6TagNm; break;
772  default: Fail;
773  }
774  return PHtmlTok(new THtmlTok(HTagSym, HTagNm));
775 }
776 
778 // Html-Document
779 THtmlDoc::THtmlDoc(const PSIn& SIn, const THtmlDocType& Type, const bool& DoUc):
780  TokV(1000, 0){
781  THtmlLx Lx(SIn);
782  bool MkTok=false; bool InUL=false;
783  while (Lx.GetSym()!=hsyEof){
784  switch (Type){
785  case hdtAll: MkTok=true; break;
786  case hdtStr: MkTok=(Lx.Sym==hsyStr); break;
787  case hdtStrNum: MkTok=(Lx.Sym==hsyStr)||(Lx.Sym==hsyNum); break;
788  case hdtTag: MkTok=(Lx.Sym==hsyBTag)||(Lx.Sym==hsyETag); break;
789  case hdtA: MkTok=(Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::ATagNm); break;
790  case hdtHRef:
791  MkTok=(Lx.Sym==hsyBTag)&&
792  ((Lx.UcChA==THtmlTok::ATagNm)||(Lx.UcChA==THtmlTok::AreaTagNm)||
793  (Lx.UcChA==THtmlTok::FrameTagNm)||(Lx.UcChA==THtmlTok::ImgTagNm)||
794  (Lx.UcChA==THtmlTok::MetaTagNm));
795  break;
796  case hdtUL:
797  if ((Lx.Sym==hsyBTag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=true;}
798  MkTok=InUL;
799  if ((Lx.Sym==hsyETag)&&(Lx.UcChA==THtmlTok::UlTagNm)){InUL=false;}
800  break;
801  default: Fail;
802  }
803  if (MkTok){TokV.Add(Lx.GetTok(DoUc));}
804  }
806 }
807 
809  TChA LnDocChA;
810  // prepare html parsing
811  PSIn HtmlSIn=TStrIn::New(HtmlStr);
812  THtmlLx HtmlLx(HtmlSIn);
813  bool InScript=false;
814  // save text
815  while (HtmlLx.GetSym()!=hsyEof){
816  TStr Str=HtmlLx.ChA;
817  switch (HtmlLx.Sym){
818  case hsyStr:
819  case hsyNum:
820  case hsySSym:
821  if (InScript){break;}
822  if (HtmlLx.PreSpaces>0){LnDocChA+=' ';}
823  LnDocChA+=Str.CStr();
824  break;
825  case hsyBTag:
826  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
827  if ((!InScript)&&(Str=="<SCRIPT>")){InScript=true;}
828  break;
829  case hsyETag:
830  if ((!LnDocChA.Empty())&&(LnDocChA.LastCh()!=' ')){LnDocChA+=' ';}
831  if ((InScript)&&(Str=="<SCRIPT>")){InScript=false;}
832  break;
833  default: break;
834  }
835  }
836  // return result
837  return LnDocChA;
838 }
839 
841  const TStr& BaseUrlStr, const bool& OutUrlP, const bool& OutTagsP){
842  // prepare output-string
843  TChA OutChA; OutChA+=' ';
844  // prepare html parsing
845  PSIn HtmlSIn=TStrIn::New(HtmlStr);
846  THtmlLx HtmlLx(HtmlSIn);
847  bool InScript=false;
848  // save text
849  while (HtmlLx.GetSym()!=hsyEof){
850  TStr Str=HtmlLx.ChA;
851  switch (HtmlLx.Sym){
852  case hsyUndef:
853  case hsyUrl:
854  case hsyMTag:
855  break;
856  case hsyStr:
857  case hsyNum:
858  case hsySSym:
859  if (InScript){break;}
860  if (HtmlLx.PreSpaces>0){if (OutChA.LastCh()!=' '){OutChA+=' ';}}
861  OutChA+=Str;
862  break;
863  case hsyBTag:
864  // extract tag name
865  Str=Str.GetSubStr(1, Str.Len()-2);
866  // process tag
867  if (!InScript){
868  // check script tag
869  if (Str=="SCRIPT"){
870  InScript=true; break;}
871  // output tag
872  if (OutTagsP){
873  OutChA+='<'; OutChA+=Str; OutChA+='>';
874  } else {
875  if (OutChA.LastCh()!=' '){OutChA+=' ';}
876  }
877  // check if URL present
878  PHtmlTok Tok=HtmlLx.GetTok();
879  TStr RelUrlStr;
880  if (Tok->IsUrlTok(RelUrlStr)){
881  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
882  if (Url->IsOk()){
883  if (OutUrlP){
884  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
885  OutChA+="<Url>"; OutChA+=XmlUrlStr; OutChA+="</Url>";
886  }
887  }
888  }
889  }
890  break;
891  case hsyETag:
892  // extract tag name
893  Str=Str.GetSubStr(1, Str.Len()-2);
894  // process tag
895  if (InScript){
896  if (Str=="SCRIPT"){
897  InScript=false; break;}
898  } else {
899  if (OutTagsP){
900  OutChA+="</"; OutChA+=Str; OutChA+='>';
901  } else {
902  if (OutChA.LastCh()!=' '){OutChA+=' ';}
903  }
904  }
905  break;
906  case hsyEof: break;
907  default: Fail;
908  }
909  }
910  // return string
911  return OutChA;
912 }
913 
914 
915 void THtmlDoc::SaveTxt(const PSOut& SOut, const bool& TxtMode) const {
916  if (TxtMode){
917  for (int TokN=0; TokN<TokV.Len(); TokN++){TokV[TokN]->SaveTxt(SOut);}
918  SOut->PutLn();
919  } else {
920  for (int TokN=0; TokN<TokV.Len(); TokN++){
921  SOut->PutStr(TInt::GetStr(TokN)); SOut->PutStr(": ");
922  TokV[TokN]->SaveTxt(SOut);
923  SOut->PutLn();
924  }
925  }
926 }
927 
929  const TStr& HtmlStr, const PSOut& TxtSOut, const TStr& BaseUrlStr,
930  const bool& OutUrlP, const bool& OutTagsP){
931  // get text-string from html-string
932  TStr TxtStr=GetTxtLnDoc(HtmlStr, BaseUrlStr, OutUrlP, OutTagsP);
933  // save text-string
934  TxtStr.SaveTxt(TxtSOut);
935 }
936 
938  const TStr& HtmlStr, const TStr& TxtFNm, const TStr& BaseUrlStr,
939  const bool& OutUrlP, const bool& OutTagsP){
940  // create output file
941  PSOut TxtSOut=TFOut::New(TxtFNm);
942  // save to output file
943  SaveHtmlToTxt(HtmlStr, TxtSOut, BaseUrlStr, OutUrlP, OutTagsP);
944 }
945 
947  const TStr& HtmlStr, const PSOut& XmlSOut, const TStr& BaseUrlStr,
948  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
949  const bool& OutTagsP, const bool& OutArgsP){
950  // prepare output-file-id
951  TFileId fXml=XmlSOut->GetFileId();
952  // create outgoing url
953  TStrV OutUrlStrV;
954  // open top tag
955  fprintf(fXml, "<HtmlDoc>\n");
956  // save url
957  if (!BaseUrlStr.Empty()){
958  TStr XmlBaseUrlStr=TXmlLx::GetXmlStrFromPlainStr(BaseUrlStr);
959  fprintf(fXml, "<BaseUrl>%s</BaseUrl>\n", XmlBaseUrlStr.CStr());
960  }
961  // prepare html parsing
962  PSIn HtmlSIn=TStrIn::New(HtmlStr);
963  THtmlLx HtmlLx(HtmlSIn);
964  TChA ContTextChA; bool InScript=false;
965  // save text
966  fprintf(fXml, "<Body>\n");
967  while (HtmlLx.GetSym()!=hsyEof){
968  TStr Str=HtmlLx.ChA;
969  switch (HtmlLx.Sym){
970  case hsyUndef:
971  case hsyUrl:
972  case hsyMTag:
973  break;
974  case hsyStr:
975  if (InScript){break;}
977  if (OutToksP){
978  fprintf(fXml, " <Str>%s</Str>\n", Str.CStr());}
979  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
980  break;
981  case hsyNum:
982  if (InScript){break;}
984  if (OutToksP){
985  fprintf(fXml, " <Num>%s</Num>\n", Str.CStr());}
986  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
987  break;
988  case hsySSym:
989  if (InScript){break;}
991  if (OutToksP){
992  fprintf(fXml, " <Sym>%s</Sym>\n", Str.CStr());}
993  if (!ContTextChA.Empty()){ContTextChA+=' ';} ContTextChA+=Str;
994  break;
995  case hsyBTag:{
996  // save continuos text
997  if (!ContTextChA.Empty()){
998  if (OutTextP){
999  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1000  ContTextChA.Clr();
1001  }
1002  // extract tag name
1003  Str=Str.GetSubStr(1, Str.Len()-2);
1005  // process tag
1006  if (!InScript){
1007  // check script tag
1008  if (Str=="SCRIPT"){
1009  InScript=true; break;}
1010  // output tag
1011  if (OutTagsP){
1012  if (OutArgsP){
1013  fprintf(fXml, " <BTag Nm=\"%s\">\n", Str.CStr());
1014  for (int ArgN=0; ArgN<HtmlLx.GetArgs(); ArgN++){
1015  TStr ArgNm=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgNm(ArgN));
1016  TStr ArgVal=TXmlLx::GetXmlStrFromPlainStr(HtmlLx.GetArgVal(ArgN));
1017  fprintf(fXml, " <Arg Nm=\"%s\" Val=\"%s\"/>", ArgNm.CStr(), ArgVal.CStr());
1018  }
1019  fprintf(fXml, " </BTag>\n");
1020  } else {
1021  fprintf(fXml, " <BTag Nm=\"%s\"/>\n", Str.CStr());
1022  }
1023  }
1024  // check if URL present
1025  PHtmlTok Tok=HtmlLx.GetTok();
1026  TStr RelUrlStr;
1027  if (Tok->IsUrlTok(RelUrlStr)){
1028  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1029  if (Url->IsOk()){
1030  OutUrlStrV.Add(Url->GetUrlStr());
1031  if (OutUrlP){
1032  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(Url->GetUrlStr());
1033  fprintf(fXml, " <Url>%s</Url>\n", XmlUrlStr.CStr());
1034  }
1035  }
1036  }
1037  }
1038  break;}
1039  case hsyETag:{
1040  // save continuos text
1041  if (!ContTextChA.Empty()){
1042  if (OutTextP){
1043  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1044  ContTextChA.Clr();
1045  }
1046  // extract tag name
1047  Str=Str.GetSubStr(1, Str.Len()-2);
1049  // process tag
1050  if (InScript){
1051  if (Str=="SCRIPT"){
1052  InScript=false; break;}
1053  } else {
1054  if (OutTagsP){
1055  fprintf(fXml, " <ETag Nm=\"%s\"/>\n", Str.CStr());}
1056  }
1057  break;}
1058  case hsyEof: break;
1059  default: Fail;
1060  }
1061  }
1062  // save continuos text
1063  if (!ContTextChA.Empty()){
1064  if (OutTextP){
1065  fprintf(fXml, " <Text>%s</Text>\n", ContTextChA.CStr());}
1066  ContTextChA.Clr();
1067  }
1068  fprintf(fXml, "</Body>\n");
1069  // save outgoing urls
1070  fprintf(fXml, "<OutUrls>\n");
1071  for (int UrlN=0; UrlN<OutUrlStrV.Len(); UrlN++){
1072  TStr XmlUrlStr=TXmlLx::GetXmlStrFromPlainStr(OutUrlStrV[UrlN]);
1073  fprintf(fXml, " <Url N=\"%d\">%s</Url>\n", 1+UrlN, XmlUrlStr.CStr());
1074  }
1075  fprintf(fXml, "</OutUrls>\n");
1076 
1077  // close top tag
1078  fprintf(fXml, "</HtmlDoc>\n");
1079 }
1080 
1082  const TStr& HtmlStr, const TStr& XmlFNm, const TStr& BaseUrlStr,
1083  const bool& OutTextP, const bool& OutUrlP, const bool& OutToksP,
1084  const bool& OutTagsP, const bool& OutArgsP){
1085  // create output file
1086  PSOut XmlSOut=TFOut::New(XmlFNm);
1087  // save to output file
1088  SaveHtmlToXml(HtmlStr, XmlSOut, BaseUrlStr, OutTextP, OutUrlP,
1089  OutToksP, OutTagsP, OutArgsP);
1090 }
1091 
1092 TLxSym THtmlDoc::GetLxSym(const THtmlLxSym& HtmlLxSym, const TChA& ChA){
1093  switch (HtmlLxSym){
1094  case hsyUndef: return syUndef;
1095  case hsyStr: return syStr;
1096  case hsyNum: return syFlt;
1097  case hsySSym: return TLxSymStr::GetSSym(ChA);
1098  case hsyUrl: return syStr;
1099  case hsyBTag: return syStr;
1100  case hsyETag: return syStr;
1101  case hsyEof: return syEof;
1102  default: Fail; return syUndef;
1103  }
1104 }
1105 
1107  const TStr& TagStr, const TStr& ArgNm, THtmlLx& Lx,
1108  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
1109  IAssert(Lx.Sym==hsyBTag);
1110  if ((Lx.ChA==TagStr)&&(Lx.IsArg(ArgNm))){
1111  TStr RelUrlStr=Lx.GetArg(ArgNm);
1112  PUrl Url=TUrl::New(RelUrlStr, BaseUrlStr);
1113  if (Url->IsOk(usHttp)){
1114  TStr UrlStr=Url->GetUrlStr();
1115  PUrlEnv RedirUrlEnv=TUrlEnv::New(RedirUrlStr, "url", UrlStr);
1116  Lx.PutArg(ArgNm, RedirUrlEnv->GetFullUrlStr());
1117  return true;
1118  } else {
1119  return false;
1120  }
1121  } else {
1122  return false;
1123  }
1124 }
1125 
1127  const TStr& BaseUrlStr, const TStr& RedirUrlStr){
1128  PSIn SIn=TStrIn::New(HtmlStr);
1129  TMOut SOut;
1130  THtmlLx Lx(SIn);
1131  while (Lx.GetSym()!=hsyEof){
1132  SOut.PutStr(Lx.PreSpaceChA);
1133  if ((Lx.Sym==hsyBTag)&&(
1134  (_IsTagRedir(THtmlTok::ATagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1135  (_IsTagRedir(THtmlTok::AreaTagNm, THtmlTok::HRefArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1136  (_IsTagRedir(THtmlTok::FrameTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr))||
1137  (_IsTagRedir(THtmlTok::ImgTagNm, THtmlTok::SrcArgNm, Lx, BaseUrlStr, RedirUrlStr)))){
1138  SOut.PutStr(Lx.GetFullBTagStr());
1139  } else {
1140  SOut.PutStr(Lx.SymChA());
1141  }
1142  }
1143  return SOut.GetAsStr();
1144 }
1145 
1147 // Html-Hyper-Link-Document-Vector
1148 THtmlHldV::THtmlHldV(const PHtmlDoc& _RefHtmlDoc, const int& HldWnLen):
1149  RefHtmlDoc(_RefHtmlDoc), HldV(){
1150  bool IsTitleAct=false; THtmlTokV TitleTokV;
1151  bool IsHAct=false; int ActHTagN=-1;
1152  TVec<THtmlTokV> HTokV(6);
1153  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
1154  for (int TokN=0; TokN<RefHtmlDoc->GetToks(); TokN++){
1155  Tok=RefHtmlDoc->GetTok(TokN, TokSym, TokStr);
1156  if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
1157  // collect tokens before, inside and after <a> ... </a> tags
1158  int ATokN; PHtmlTok ATok; THtmlLxSym ATokSym; TStr ATokStr;
1159  // inside <A> tags
1160  THtmlTokV ATokV; ATokN=TokN;
1161  forever{
1162  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1163  if (ATokSym!=hsySSym){ATokV.Add(ATok);}
1164  if ((ATokSym==hsyETag)&&(ATokStr==THtmlTok::ATagNm)){break;}
1165  ATokN++;
1166  if (ATokN>=RefHtmlDoc->GetToks()){break;}
1167  }
1168  int ETagATokN=ATokN+1;
1169  // before <A> tags
1170  THtmlTokV PrevATokV; ATokN=TokN;
1171  forever{
1172  ATokN--;
1173  if (ATokN<0){break;}
1174  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1175  if (THtmlTok::IsBreakTok(ATok)){break;}
1176  if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){PrevATokV.Add(ATok);}
1177  if (ATokV.Len()>=HldWnLen){break;}
1178  }
1179  // after <A> tags
1180  THtmlTokV NextATokV; ATokN=ETagATokN;
1181  forever{
1182  ATokN++;
1183  if (ATokN>=RefHtmlDoc->GetToks()){break;}
1184  ATok=RefHtmlDoc->GetTok(ATokN, ATokSym, ATokStr);
1185  if (THtmlTok::IsBreakTok(ATok)){break;}
1186  if ((ATokSym==hsyStr)||(ATokSym==hsyNum)){NextATokV.Add(ATok);}
1187  if (ATokV.Len()>=HldWnLen){break;}
1188  }
1189  // construct html-document with hyper-link context
1190  PHtmlDoc HtmlDoc=PHtmlDoc(new THtmlDoc());
1191  HtmlDoc->AddTokV(TitleTokV);
1192  for (int HTagN=1; HTagN<=6; HTagN++){HtmlDoc->AddTokV(HTokV[HTagN-1]);}
1193  HtmlDoc->AddTokV(PrevATokV);
1194  HtmlDoc->AddTokV(ATokV);
1195  HtmlDoc->AddTokV(NextATokV);
1196  HldV.Add(HtmlDoc);
1197  HtmlDoc->SaveTxt(TSOut::StdOut);
1198  } else
1199  if (TokSym==hsyBTag){
1200  int HTagN;
1201  if (TokStr==THtmlTok::TitleTagNm){
1202  IsTitleAct=true; TitleTokV.Clr(); TitleTokV.Add(Tok);
1203  } else
1204  if (THtmlTok::IsHTag(TokStr, HTagN)){
1205  if (IsHAct){// conclude previous <H?> tag if left open
1206  HTokV[ActHTagN-1].Add(THtmlTok::GetHTok(false, ActHTagN));}
1207  IsHAct=true; ActHTagN=HTagN;
1208  {for (int HTagN=ActHTagN; HTagN<=6; HTagN++){HTokV[HTagN-1].Clr();}}
1209  HTokV[ActHTagN-1].Add(Tok);
1210  }
1211  } else
1212  if (TokSym==hsyETag){
1213  int HTagN;
1214  if (TokStr==THtmlTok::TitleTagNm){
1215  if (IsTitleAct){TitleTokV.Add(Tok); IsTitleAct=false;}
1216  } else
1217  if (THtmlTok::IsHTag(TokStr, HTagN)){
1218  if (IsHAct){HTokV[ActHTagN-1].Add(Tok); IsHAct=false;}
1219  }
1220  } else
1221  if (TokSym!=hsySSym){
1222  if (IsTitleAct){TitleTokV.Add(Tok);}
1223  if (IsHAct){HTokV[ActHTagN-1].Add(Tok);}
1224  }
1225  }
1226 }
1227 
1229 // Web-Page
1230 void TWebPg::GetOutUrlV(TUrlV& OutUrlV, TUrlV& OutRedirUrlV) const {
1231  // create outgoing url vector
1232  OutUrlV.Clr(); OutRedirUrlV.Clr();
1233  // take interesting web-page components
1234  TStr UrlStr=GetUrlStr();
1235  TStr HtmlStr=GetHttpBodyAsStr();
1236  // prepare html parsing
1237  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1238  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1239  PHtmlTok Tok;
1240  // traverse html
1241  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
1242  PHtmlTok Tok=HtmlDoc->GetTok(TokN);
1243  if (Tok->GetSym()==hsyBTag){
1244  TStr RelUrlStr;
1245  if (Tok->IsUrlTok(RelUrlStr)){
1246  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1247  if (Url->IsOk(usHttp)){
1248  OutUrlV.Add(Url);
1249  if (Tok->IsRedirUrlTok()){
1250  OutRedirUrlV.Add(Url);
1251  }
1252  }
1253  }
1254  }
1255  }
1256 }
1257 
1258 void TWebPg::GetOutDescUrlStrKdV(TStrKdV& OutDescUrlStrKdV) const {
1259  // create outgoing url vector
1260  OutDescUrlStrKdV.Clr();
1261  // take interesting web-page components
1262  TStr UrlStr=GetUrlStr();
1263  TStr HtmlStr=GetHttpBodyAsStr();
1264  // prepare html parsing
1265  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1266  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1267  // traverse html documents
1268  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
1269  int TokN=0; int Toks=HtmlDoc->GetToks();
1270  while (TokN<Toks){
1271  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1272  if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
1273  TStr RelUrlStr;
1274  if (Tok->IsUrlTok(RelUrlStr)){
1275  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1276  if (Url->IsOk()){
1277  TChA DescChA;
1278  while (TokN<Toks){
1279  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1280  if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
1281  break;
1282  } else {
1283  if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
1284  if (!DescChA.Empty()){DescChA+=' ';}
1285  DescChA+=TokStr;
1286  }
1287  }
1288  }
1289  OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
1290  }
1291  }
1292  }
1293  }
1294 }
1295 
1296 void TWebPg::SaveAsHttpBody(const TStr& FNm) const {
1297  // create output file
1298  PSOut SOut=TFOut::New(FNm);
1299  // save http-body
1300  HttpResp->SaveBody(SOut);
1301 }
1302 
1303 void TWebPg::SaveAsHttp(const TStr& FNm) const {
1304  // create output file
1305  PSOut SOut=TFOut::New(FNm);
1306  // save http
1307  HttpResp->SaveTxt(SOut);
1308 }
1309 
1310 bool TWebPg::IsTxt() const {
1312  TStr Str=HttpResp->GetBodyAsStr();
1313  int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
1314  while ((ChN<100)&&(ChN<StrLen)){
1315  char Ch=Str[ChN++];
1316  if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
1317  PrintChs++;}
1318  }
1319  double PrintPrb=double(PrintChs)/double(ChN+1);
1320  return PrintPrb>0.9;
1321  } else {
1322  return false;
1323  }
1324 }
1325 
PHtmlDoc RefHtmlDoc
Definition: html.h:313
Definition: html.h:252
THtmlDocType
Definition: html.h:251
#define IAssert(Cond)
Definition: bd.h:262
static const TStr H5TagNm
Definition: html.h:227
THtmlLxSym
Definition: html.h:78
bool IsGetETag(const TStr &TagNm)
Definition: html.cpp:547
static const TStr H4TagNm
Definition: html.h:226
static TStr GetCSZFromYuascii(const TChA &ChA)
Definition: html.cpp:111
TChA ArgNm
Definition: html.h:93
TStr GetHRefBeforeStr(const TStr &Str)
Definition: html.cpp:530
TStr GetStr() const
Definition: dt.h:1200
static TStr GetWin1250FromYuascii(const TChA &ChA)
Definition: html.cpp:149
static const TStr FrameTagNm
Definition: html.h:222
static THtmlLxChDef ChDef
Definition: html.h:84
THtmlTok()
Definition: html.h:188
static const TStr H3TagNm
Definition: html.h:225
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
bool IsWs(const char &Ch) const
Definition: html.h:36
int Len() const
Definition: dt.h:490
TIntV ChTyV
Definition: html.h:16
static const TStr H1TagNm
Definition: html.h:223
static const TStr TitleArgNm
Definition: html.h:240
Definition: html.h:252
static const TStr LiTagNm
Definition: html.h:230
TChA EscChA
Definition: html.h:92
Definition: html.h:252
Definition: html.h:79
void MoveToETagOrEof(const TStr &TagNm)
Definition: html.cpp:441
static const TStr HRefArgNm
Definition: html.h:238
TStr GetFullBTagStr() const
Definition: html.cpp:358
TStr GetUrlStr(const int &UrlN=-1) const
Definition: html.h:355
void GetOutDescUrlStrKdV(TStrKdV &OutDescUrlStrKdV) const
Definition: html.cpp:1258
Definition: url.h:5
int SymBChX
Definition: html.h:108
void PutStr(const TStr &Str)
Definition: html.h:128
bool IsNum(const char &Ch) const
Definition: html.h:40
#define forever
Definition: bd.h:6
static bool IsBreakTok(const PHtmlTok &Tok)
Definition: html.cpp:744
Definition: html.h:182
THtmlDoc()
Definition: html.h:258
void MoveToBTagArgOrEof(const TStr &TagNm, const TStr &ArgNm, const TStr &ArgVal)
Definition: html.cpp:400
TStr GetEscStr(const TStr &Str) const
Definition: html.cpp:33
bool Empty() const
Definition: dt.h:260
#define Fail
Definition: bd.h:238
static void GetTokStrV(const TStr &Str, TStrV &TokStrV)
Definition: html.cpp:595
TStr GetUc() const
Definition: dt.h:496
void SaveAsHttp(const TStr &FNm) const
Definition: html.cpp:1303
Definition: html.h:79
TLxSym
Definition: lx.h:44
void Clr()
Definition: dt.h:258
void AddCh(const char &Ch, const int &MxLen=-1)
Definition: dt.h:271
TStr GetFullStr() const
Definition: html.cpp:628
bool IsEoln(const char &Ch) const
Definition: html.h:35
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
void SaveTxt(const PSOut &SOut) const
Definition: http.h:205
void PutCh(const char &_Ch)
Definition: html.h:126
bool DoParseArg
Definition: html.h:87
THtmlLxSym GetSym()
Definition: html.cpp:277
TStr GetArg(const TStr &ArgNm) const
Definition: html.h:207
static const TStr MetaTagNm
Definition: html.h:231
Definition: html.h:12
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
int Len() const
Definition: dt.h:259
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void MoveToBTagArg2OrEof(const TStr &TagNm, const TStr &ArgNm1, const TStr &ArgVal1, const TStr &ArgNm2, const TStr &ArgVal2, const bool &AndOpP=true)
Definition: html.cpp:410
static TStr GetEscapedStr(const TChA &ChA)
Definition: html.cpp:568
bool IsOk(const TUrlScheme _Scheme=usUndef) const
Definition: url.h:32
static bool IsBreakTag(const TStr &TagNm)
Definition: html.cpp:726
TStr GetStr() const
Definition: html.h:203
static PHtmlLxChDef ChDef
Definition: html.h:65
static const TStr TitleTagNm
Definition: html.h:234
char Ch
Definition: html.h:89
int PutLn(const int &Lns=1)
Definition: fl.cpp:158
void GetTag()
Definition: html.cpp:236
bool IsArg(const TStr &ArgNm) const
Definition: html.h:205
bool IsGetBTag(const TStr &TagNm)
Definition: html.cpp:541
TStr GetAsStr() const
Definition: fl.cpp:869
static const TStr AreaTagNm
Definition: html.h:218
TPt< THtmlDoc > PHtmlDoc
Definition: html.h:6
static TLxSym GetLxSym(const THtmlLxSym &HtmlLxSym, const TChA &ChA)
Definition: html.cpp:1092
THtmlLxSym GetSym() const
Definition: html.h:202
Definition: html.h:11
TStr Str
Definition: html.h:185
void SaveAsHttpBody(const TStr &FNm) const
Definition: html.cpp:1296
Definition: html.h:11
TChA PreSpaceChA
Definition: html.h:113
static const TStr HttpEquivArgNm
Definition: html.h:241
TStr GetArgVal(const int &ArgN) const
Definition: html.h:137
void SetChTy(const THtmlLxChTy &ChTy, const TStr &Str)
Definition: html.cpp:24
static const char EofCh
Definition: dt.h:1040
Definition: html.h:252
static const char Mx
Definition: dt.h:1033
Definition: html.h:252
static const TStr H2TagNm
Definition: html.h:224
static const TStr ATagNm
Definition: html.h:217
static const TStr CenterTagNm
Definition: html.h:221
PHttpResp HttpResp
Definition: html.h:334
Definition: lx.h:45
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
THtmlTokV TokV
Definition: html.h:256
THtmlLxChDef()
Definition: html.cpp:48
TPt< THtmlLxChDef > PHtmlLxChDef
Definition: html.h:14
bool IsArg(const TStr &ArgNm) const
Definition: html.h:138
char * CStr()
Definition: dt.h:255
static const TStr UlTagNm
Definition: html.h:233
static TStr GetNoTag(const TStr &Str)
Definition: html.cpp:606
virtual TFileId GetFileId() const
Definition: fl.h:146
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1229
Definition: dt.h:1028
static const TStr CardTagNm
Definition: html.h:220
static const char TabCh
Definition: dt.h:1037
static const char Mn
Definition: dt.h:1032
TStr GetArgNm(const int &ArgN) const
Definition: html.h:136
Definition: lx.h:51
static PSIn New(const TStr &Str)
Definition: dt.h:711
TStr GetStrToETag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:494
void PutArg(const TStr &ArgNm, const TStr &ArgVal)
Definition: html.h:142
char LastCh() const
Definition: dt.h:281
bool EscCh
Definition: html.h:91
Definition: html.h:80
PHtmlTok GetTok(const bool &DoUc=true)
Definition: html.cpp:353
int SymEChX
Definition: html.h:108
Definition: html.h:12
TStr GetArg(const TStr &ArgNm, const TStr &DfArgVal=TStr()) const
Definition: html.h:139
TStr GetBodyAsStr() const
Definition: http.h:170
int GetToks() const
Definition: html.h:270
Definition: html.h:11
static PUrlEnv New()
Definition: url.h:113
bool IsAlNum(const char &Ch) const
Definition: html.h:41
TStr GetStrInTag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:525
static PHtmlTok GetHTok(const bool &IsBTag, const int &HTagN)
Definition: html.cpp:762
void SaveTxt(const PSOut &SOut, const bool &TxtMode=true) const
Definition: html.cpp:915
TStr GetUrlStr() const
Definition: url.h:36
TChA UcChA
Definition: html.h:110
void SetEscStr(const TStr &SrcStr, const TStr &DstStr)
Definition: html.cpp:29
static TStr GetCSZFromWin1250(const TChA &ChA)
Definition: html.cpp:132
static const TStr H6TagNm
Definition: html.h:228
static TStr GetIsoCeFromYuascii(const TChA &ChA)
Definition: html.cpp:170
unsigned char uchar
Definition: bd.h:10
void MoveToBTagOrETagOrEof(const TStr &BTagNm, const TStr &ETagNm)
Definition: html.cpp:394
static const TPt< TSOut > StdOut
Definition: fl.h:208
TChA ChA
Definition: html.h:109
Definition: html.h:79
bool IsRedirUrlTok() const
Definition: html.cpp:676
Definition: lx.h:45
void MoveToBTag3OrEof(const TStr &TagNm1, const TStr &TagNm2, const TStr &TagNm3)
Definition: html.cpp:388
static const char LfCh
Definition: dt.h:1038
int GetChTy(const char &Ch) const
Definition: html.h:34
void GetMetaTag()
Definition: html.cpp:225
static void SaveHtmlToTxt(const TStr &HtmlStr, const PSOut &TxtSOut, const TStr &BaseUrlStr, const bool &OutUrlP, const bool &OutToksP)
Definition: html.cpp:928
TStrStrH EscStrH
Definition: html.h:19
Definition: dt.h:1137
bool IsTxt() const
Definition: html.cpp:1310
int ChX
Definition: html.h:90
bool IsSpace(const char &Ch) const
Definition: html.h:38
int GetKeyId(const TKey &Key) const
Definition: hash.h:466
Definition: fl.h:495
void AddTokV(const THtmlTokV &_TokV)
Definition: html.h:274
Definition: dt.h:201
TStr GetTextOnlyStrToEof()
Definition: html.cpp:447
static bool IsHTag(const TStr &TagNm, int &HTagN)
Definition: html.cpp:752
Definition: html.h:79
TChV LcChV
Definition: html.h:18
Definition: html.h:80
PSIn SIn
Definition: html.h:85
void GetCh()
Definition: html.h:95
Definition: html.h:12
int AddKey(const TKey &Key)
Definition: hash.h:373
static const TStr ImgTagNm
Definition: html.h:229
static const TStr PTagNm
Definition: html.h:232
TStr GetStrToETag2(const TStr &TagNm1, const TStr &TagNm2, const bool &TxtOnlyP=false)
Definition: html.cpp:509
static TLxSym GetSSym(const TStr &Str)
Definition: lx.cpp:186
void MoveToStrOrEof(const TStr &Str)
Definition: html.cpp:370
void GetOutUrlV(TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const
Definition: html.cpp:1230
static void SaveHtmlToXml(const TStr &HtmlStr, const PSOut &XmlSOut, const TStr &BaseUrlStr, const bool &OutTextP, const bool &OutUrlP, const bool &OutToksP, const bool &OutTagsP, const bool &OutArgsP)
Definition: html.cpp:946
bool IsAlpha(const char &Ch) const
Definition: html.h:39
static TStr GetAsciiStr(const TChA &ChA, const char &GenericCh='_')
Definition: html.cpp:584
void GetEscCh()
Definition: html.cpp:195
Definition: html.h:80
static const TStr AltArgNm
Definition: html.h:237
Definition: dt.h:412
TChA SymChA
Definition: html.h:111
THtmlHldV(const PHtmlDoc &_RefHtmlDoc, const int &HldWnLen=10)
Definition: html.cpp:1148
bool Empty() const
Definition: dt.h:491
TPt< THtmlTok > PHtmlTok
Definition: html.h:5
char GetUc(const char &Ch) const
Definition: html.h:52
bool IsUrlTok(TStr &RelUrlStr) const
Definition: html.cpp:648
THtmlLxSym Sym
Definition: html.h:107
Definition: html.h:79
int PutStr(const char *CStr)
Definition: fl.cpp:117
void MoveToBTag2OrEof(const TStr &TagNm1, const TStr &TagNm2)
Definition: html.cpp:382
Definition: html.h:252
static const char CrCh
Definition: dt.h:1039
FILE * TFileId
Definition: bd.h:17
bool IsContType() const
Definition: http.h:192
int GetArgs() const
Definition: html.h:135
Definition: html.h:80
static const TStr TextFldVal
Definition: http.h:25
Definition: html.h:82
Definition: bd.h:196
THtmlLx::TArgNmValV ArgNmValV
Definition: html.h:186
TChV UcChV
Definition: html.h:17
static const TStr BrTagNm
Definition: html.h:219
Definition: html.h:254
THtmlLxChTy
Definition: html.h:10
void SplitOnStr(const TStr &SplitStr, TStrV &StrV) const
Definition: dt.cpp:1008
static TStr GetRedirHtmlDocStr(const TStr &HtmlStr, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
Definition: html.cpp:1126
TArgNmValV ArgNmValV
Definition: html.h:115
TChA ArgVal
Definition: html.h:94
void SaveBody(const PSOut &SOut) const
Definition: http.h:207
static TStr GetTxtLnDoc(const TStr &HtmlStr)
Definition: html.cpp:808
void SaveTxt(const PSOut &SOut, const bool &TxtMode=true)
Definition: html.cpp:691
Definition: html.h:11
THtmlLxSym Sym
Definition: html.h:184
int PreSpaces
Definition: html.h:112
char * CStr()
Definition: dt.h:479
char Pop()
Definition: dt.h:265
void MoveToBTagOrEof(const TStr &TagNm)
Definition: html.cpp:376
bool IsKey(const TKey &Key) const
Definition: hash.h:258
static PHtmlDoc New(const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
Definition: html.h:261
Definition: lx.h:45
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
int Len() const
Definition: hash.h:228
static const TStr TitleETagNm
Definition: html.h:235
TDat & AddDat(const TKey &Key)
Definition: hash.h:238
PHtmlTok GetTok(const int &TokN) const
Definition: html.h:271
static const TStr SrcArgNm
Definition: html.h:239
static bool _IsTagRedir(const TStr &TagStr, const TStr &ArgNm, THtmlLx &Lx, const TStr &BaseUrlStr, const TStr &RedirUrlStr)
Definition: html.cpp:1106
THtmlDocV HldV
Definition: html.h:314
void SetUcCh(const char &UcCh, const char &LcCh)
Definition: html.cpp:3
TKeyDat< TStr, TStr > TStrKd
Definition: ds.h:405
static TStr GetSymStr(const THtmlLxSym &Sym)
Definition: html.cpp:553
TStr GetFullUrlStr() const
Definition: url.cpp:445
void SaveTxt(const PSOut &SOut) const
Definition: dt.h:673
static TStr GetXmlStrFromPlainStr(const TChA &PlainChA)
Definition: xml.cpp:968
TStr GetStrToBTag(const TStr &TagNm, const bool &TxtOnlyP=false)
Definition: html.cpp:462
TStr GetHttpBodyAsStr() const
Definition: html.h:368