SNAP Library 6.0, User Reference  2020-12-09 16:24:20
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
TWebPg Class Reference

#include <html.h>

Public Member Functions

 TWebPg ()
 
 TWebPg (const TStrV &_UrlStrV, const TStrV &_IpNumV, const PHttpResp &_HttpResp)
 
 ~TWebPg ()
 
 TWebPg (TSIn &)
 
void Save (TSOut &)
 
TWebPg & operator= (const TWebPg &)
 
int GetUrls () const
 
TStr GetUrlStr (const int &UrlN=-1) const
 
PUrl GetUrl (const int &UrlN=-1) const
 
int GetIps () const
 
TStr GetIpNum (const int &IpN=-1) const
 
PHttpResp GetHttpResp () const
 
TStr GetHttpHdStr () const
 
TStr GetHttpBodyAsStr () const
 
void GetOutUrlV (TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const
 
void GetOutUrlV (TUrlV &OutUrlV) const
 
void GetOutDescUrlStrKdV (TStrKdV &OutDescUrlStrKdV) const
 
void PutFetchMSecs (const uint64 &_FetchMSecs)
 
uint64 GetFetchMSecs () const
 
void SaveAsHttpBody (const TStr &FNm) const
 
void SaveAsHttp (const TStr &FNm) const
 
bool IsTxt () const
 

Static Public Member Functions

static PWebPg New (const TStrV &UrlStrV, const TStrV &IpNumV, const PHttpResp &HttpResp)
 
static PWebPg New (const TStrV &UrlStrV, const PHttpResp &HttpResp)
 
static PWebPg New (const TStr &UrlStr, const PHttpResp &HttpResp)
 
static PWebPg Load (TSIn &)
 

Private Attributes

TCRef CRef
 
TStrV UrlStrV
 
TStrV IpNumV
 
PHttpResp HttpResp
 
uint64 FetchMSecs
 

Friends

class TPt< TWebPg >
 

Detailed Description

Definition at line 330 of file html.h.

Constructor & Destructor Documentation

TWebPg::TWebPg ( )
inline

Definition at line 337 of file html.h.

337 : UrlStrV(), IpNumV(), HttpResp(){}
PHttpResp HttpResp
Definition: html.h:334
TStrV IpNumV
Definition: html.h:333
TStrV UrlStrV
Definition: html.h:332
TWebPg::TWebPg ( const TStrV & _UrlStrV,
const TStrV & _IpNumV,
const PHttpResp & _HttpResp 
)
inline

Definition at line 338 of file html.h.

338  :
339  UrlStrV(_UrlStrV), IpNumV(_IpNumV), HttpResp(_HttpResp){}
PHttpResp HttpResp
Definition: html.h:334
TStrV IpNumV
Definition: html.h:333
TStrV UrlStrV
Definition: html.h:332
TWebPg::~TWebPg ( )
inline

Definition at line 347 of file html.h.

347 {}
TWebPg::TWebPg ( TSIn & )
inline

Definition at line 348 of file html.h.

348 {Fail;}
#define Fail
Definition: bd.h:238

Member Function Documentation

uint64 TWebPg::GetFetchMSecs ( ) const
inline

Definition at line 377 of file html.h.

377 {return FetchMSecs;}
uint64 FetchMSecs
Definition: html.h:335
TStr TWebPg::GetHttpBodyAsStr ( ) const
inline

Definition at line 368 of file html.h.

368 {return GetHttpResp()->GetBodyAsStr();}
TStr GetBodyAsStr() const
Definition: http.h:170
PHttpResp GetHttpResp() const
Definition: html.h:366
TStr TWebPg::GetHttpHdStr ( ) const
inline

Definition at line 367 of file html.h.

367 {return GetHttpResp()->GetHdStr();}
TStr GetHdStr() const
Definition: http.h:168
PHttpResp GetHttpResp() const
Definition: html.h:366
PHttpResp TWebPg::GetHttpResp ( ) const
inline

Definition at line 366 of file html.h.

366 {return HttpResp;}
PHttpResp HttpResp
Definition: html.h:334
TStr TWebPg::GetIpNum ( const int &  IpN = -1) const
inline

Definition at line 363 of file html.h.

363  {
364  if (IpN==-1){return IpNumV.Last();} else {return IpNumV[IpN];}}
TStrV IpNumV
Definition: html.h:333
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
int TWebPg::GetIps ( ) const
inline

Definition at line 362 of file html.h.

362 {return IpNumV.Len();}
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TStrV IpNumV
Definition: html.h:333
void TWebPg::GetOutDescUrlStrKdV ( TStrKdV & OutDescUrlStrKdV) const

Definition at line 1258 of file html.cpp.

1258  {
1259  // create outgoing url vector
1260  OutDescUrlStrKdV.Clr();
1261  // take interesting web-page components
1262  TStr UrlStr=GetUrlStr();
1263  TStr HtmlStr=GetHttpBodyAsStr();
1264  // prepare html parsing
1265  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1266  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1267  // traverse html documents
1268  PHtmlTok Tok; THtmlLxSym TokSym; TStr TokStr;
1269  int TokN=0; int Toks=HtmlDoc->GetToks();
1270  while (TokN<Toks){
1271  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1272  if ((TokSym==hsyBTag)&&(TokStr==THtmlTok::ATagNm)){
1273  TStr RelUrlStr;
1274  if (Tok->IsUrlTok(RelUrlStr)){
1275  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1276  if (Url->IsOk()){
1277  TChA DescChA;
1278  while (TokN<Toks){
1279  Tok=HtmlDoc->GetTok(TokN, TokSym, TokStr); TokN++;
1280  if ((TokSym==hsyETag)&&(TokStr==THtmlTok::ATagNm)){
1281  break;
1282  } else {
1283  if ((TokSym==hsyStr)||(TokSym==hsyNum)||(TokSym==hsySSym)){
1284  if (!DescChA.Empty()){DescChA+=' ';}
1285  DescChA+=TokStr;
1286  }
1287  }
1288  }
1289  OutDescUrlStrKdV.Add(TStrKd(DescChA, Url->GetUrlStr()));
1290  }
1291  }
1292  }
1293  }
1294 }
THtmlLxSym
Definition: html.h:78
Definition: html.h:79
TStr GetUrlStr(const int &UrlN=-1) const
Definition: html.h:355
bool Empty() const
Definition: dt.h:260
Definition: html.h:79
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
static const TStr ATagNm
Definition: html.h:217
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static PSIn New(const TStr &Str)
Definition: dt.h:711
int GetToks() const
Definition: html.h:270
Definition: dt.h:201
Definition: html.h:79
Definition: html.h:80
Definition: html.h:80
Definition: dt.h:412
static PHtmlDoc New(const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
Definition: html.h:261
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
PHtmlTok GetTok(const int &TokN) const
Definition: html.h:271
TKeyDat< TStr, TStr > TStrKd
Definition: ds.h:405
TStr GetHttpBodyAsStr() const
Definition: html.h:368
void TWebPg::GetOutUrlV ( TUrlV & OutUrlV,
TUrlV & OutRedirUrlV 
) const

Definition at line 1230 of file html.cpp.

1230  {
1231  // create outgoing url vector
1232  OutUrlV.Clr(); OutRedirUrlV.Clr();
1233  // take interesting web-page components
1234  TStr UrlStr=GetUrlStr();
1235  TStr HtmlStr=GetHttpBodyAsStr();
1236  // prepare html parsing
1237  PSIn HtmlSIn=TStrIn::New(HtmlStr);
1238  PHtmlDoc HtmlDoc=THtmlDoc::New(HtmlSIn);
1239  PHtmlTok Tok;
1240  // traverse html
1241  for (int TokN=0; TokN<HtmlDoc->GetToks(); TokN++){
1242  PHtmlTok Tok=HtmlDoc->GetTok(TokN);
1243  if (Tok->GetSym()==hsyBTag){
1244  TStr RelUrlStr;
1245  if (Tok->IsUrlTok(RelUrlStr)){
1246  PUrl Url=TUrl::New(RelUrlStr, UrlStr);
1247  if (Url->IsOk(usHttp)){
1248  OutUrlV.Add(Url);
1249  if (Tok->IsRedirUrlTok()){
1250  OutRedirUrlV.Add(Url);
1251  }
1252  }
1253  }
1254  }
1255  }
1256 }
TStr GetUrlStr(const int &UrlN=-1) const
Definition: html.h:355
Definition: url.h:5
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
static PSIn New(const TStr &Str)
Definition: dt.h:711
int GetToks() const
Definition: html.h:270
Definition: html.h:80
Definition: dt.h:412
static PHtmlDoc New(const PSIn &SIn, const THtmlDocType &Type=hdtAll, const bool &DoUc=true)
Definition: html.h:261
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
PHtmlTok GetTok(const int &TokN) const
Definition: html.h:271
TStr GetHttpBodyAsStr() const
Definition: html.h:368
void TWebPg::GetOutUrlV ( TUrlV & OutUrlV) const
inline

Definition at line 371 of file html.h.

371  {
372  TUrlV OutRedirUrlV; GetOutUrlV(OutUrlV, OutRedirUrlV);}
void GetOutUrlV(TUrlV &OutUrlV, TUrlV &OutRedirUrlV) const
Definition: html.cpp:1230
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:430
PUrl TWebPg::GetUrl ( const int &  UrlN = -1) const
inline

Definition at line 357 of file html.h.

357  {
358  TStr UrlStr;
359  if (UrlN==-1){UrlStr=UrlStrV.Last();} else {UrlStr=UrlStrV[UrlN];}
360  return TUrl::New(UrlStr);}
static PUrl New(const TStr &RelUrlStr, const TStr &BaseUrlStr=TStr())
Definition: url.h:25
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
Definition: dt.h:412
TStrV UrlStrV
Definition: html.h:332
int TWebPg::GetUrls ( ) const
inline

Definition at line 354 of file html.h.

354 {return UrlStrV.Len();}
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
TStrV UrlStrV
Definition: html.h:332
TStr TWebPg::GetUrlStr ( const int &  UrlN = -1) const
inline

Definition at line 355 of file html.h.

355  {
356  if (UrlN==-1){return UrlStrV.Last();} else {return UrlStrV[UrlN];}}
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
TStrV UrlStrV
Definition: html.h:332
bool TWebPg::IsTxt ( ) const

Definition at line 1310 of file html.cpp.

1310  {
1311  if ((!HttpResp->IsContType())||HttpResp->IsContType(THttp::TextFldVal)){
1312  TStr Str=HttpResp->GetBodyAsStr();
1313  int StrLen=Str.Len(); int ChN=0; int PrintChs=0;
1314  while ((ChN<100)&&(ChN<StrLen)){
1315  char Ch=Str[ChN++];
1316  if (((' '<=Ch)&&(Ch<='~'))||(Ch==TCh::TabCh)||(Ch==TCh::LfCh)||(Ch==TCh::CrCh)){
1317  PrintChs++;}
1318  }
1319  double PrintPrb=double(PrintChs)/double(ChN+1);
1320  return PrintPrb>0.9;
1321  } else {
1322  return false;
1323  }
1324 }
int Len() const
Definition: dt.h:490
PHttpResp HttpResp
Definition: html.h:334
static const char TabCh
Definition: dt.h:1037
TStr GetBodyAsStr() const
Definition: http.h:170
static const char LfCh
Definition: dt.h:1038
Definition: dt.h:412
static const char CrCh
Definition: dt.h:1039
bool IsContType() const
Definition: http.h:192
static const TStr TextFldVal
Definition: http.h:25
static PWebPg TWebPg::Load ( TSIn & )
inline static

Definition at line 349 of file html.h.

349 {Fail; return NULL;}
#define Fail
Definition: bd.h:238
static PWebPg TWebPg::New ( const TStrV & UrlStrV,
const TStrV & IpNumV,
const PHttpResp & HttpResp 
)
inline static

Definition at line 340 of file html.h.

340  {
341  return new TWebPg(UrlStrV, IpNumV, HttpResp);}
TWebPg()
Definition: html.h:337
static PWebPg TWebPg::New ( const TStrV & UrlStrV,
const PHttpResp & HttpResp 
)
inline static

Definition at line 342 of file html.h.

342  {
343  return new TWebPg(UrlStrV, TStrV(), HttpResp);}
TWebPg()
Definition: html.h:337
TVec< TStr > TStrV
Definition: ds.h:1599
static PWebPg TWebPg::New ( const TStr & UrlStr,
const PHttpResp & HttpResp 
)
inline static

Definition at line 344 of file html.h.

344  {
345  TStrV UrlStrV; UrlStrV.Add(UrlStr);
346  return new TWebPg(UrlStrV, TStrV(), HttpResp);}
TWebPg()
Definition: html.h:337
TVec< TStr > TStrV
Definition: ds.h:1599
TStrV UrlStrV
Definition: html.h:332
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
TWebPg& TWebPg::operator= ( const TWebPg & )
inline

Definition at line 352 of file html.h.

352 {Fail; return *this;}
#define Fail
Definition: bd.h:238
void TWebPg::PutFetchMSecs ( const uint64 & _FetchMSecs)
inline

Definition at line 376 of file html.h.

376 {FetchMSecs=_FetchMSecs;}
uint64 FetchMSecs
Definition: html.h:335
void TWebPg::Save ( TSOut & )
inline

Definition at line 350 of file html.h.

350 {Fail;}
#define Fail
Definition: bd.h:238
void TWebPg::SaveAsHttp ( const TStr & FNm) const

Definition at line 1303 of file html.cpp.

1303  {
1304  // create output file
1305  PSOut SOut=TFOut::New(FNm);
1306  // save http
1307  HttpResp->SaveTxt(SOut);
1308 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
void SaveTxt(const PSOut &SOut) const
Definition: http.h:205
PHttpResp HttpResp
Definition: html.h:334
Definition: bd.h:196
void TWebPg::SaveAsHttpBody ( const TStr & FNm) const

Definition at line 1296 of file html.cpp.

1296  {
1297  // create output file
1298  PSOut SOut=TFOut::New(FNm);
1299  // save http-body
1300  HttpResp->SaveBody(SOut);
1301 }
static PSOut New(const TStr &FNm, const bool &Append=false)
Definition: fl.cpp:442
PHttpResp HttpResp
Definition: html.h:334
Definition: bd.h:196
void SaveBody(const PSOut &SOut) const
Definition: http.h:207

Friends And Related Function Documentation

friend class TPt< TWebPg >
friend

Definition at line 330 of file html.h.

Member Data Documentation

TCRef TWebPg::CRef
private

Definition at line 330 of file html.h.

uint64 TWebPg::FetchMSecs
private

Definition at line 335 of file html.h.

PHttpResp TWebPg::HttpResp
private

Definition at line 334 of file html.h.

TStrV TWebPg::IpNumV
private

Definition at line 333 of file html.h.

TStrV TWebPg::UrlStrV
private

Definition at line 332 of file html.h.


The documentation for this class was generated from the following files: