SNAP Library, Developer Reference  2012-10-02 12:56:23
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
url.h
Go to the documentation of this file.
00001 
00002 // Url
// Scheme tag kept by a TUrl. Enumerators keep their implicit values
// (usUndef=0, usHttp=1, usOther=2).
enum TUrlScheme {
  usUndef,
  usHttp,
  usOther
};
00004 
00005 ClassTPV(TUrl, PUrl, TUrlV)//{
00006 private:
00007   static const TStr UrlHttpPrefixStr;
00008   static const TStr UrlHttpAbsPrefixStr;
00009   TUrlScheme Scheme;
00010   TStr UrlStr, RelUrlStr, BaseUrlStr;
00011   TStr SchemeNm, HostNm;
00012   TStr PortStr, PathStr, SearchStr, FragIdStr;
00013   int PortN;
00014   TStrV PathSegV;
00015   TStr IpNum;
00016   TStr FinalUrlStr, FinalHostNm;
00017   TStr HttpRqStr;
00018   void GetAbs(const TStr& AbsUrlStr);
00019   void GetAbsFromBase(const TStr& RelUrlStr, const TStr& BaseUrlStr);
00020   UndefDefaultCopyAssign(TUrl);
00021 public:
00022   TUrl(const TStr& _RelUrlStr, const TStr& _BaseUrlStr=TStr());
00023   static PUrl New(const TStr& RelUrlStr, const TStr& BaseUrlStr=TStr()){
00024     return PUrl(new TUrl(RelUrlStr, BaseUrlStr));}
00025   ~TUrl(){}
00026   TUrl(TSIn&){Fail;}
00027   static PUrl Load(TSIn&){Fail; return NULL;}
00028   void Save(TSOut&){Fail;}
00029 
00030   bool IsOk(const TUrlScheme _Scheme=usUndef) const {
00031     if (_Scheme==usUndef){return Scheme!=usUndef;}
00032     else {return Scheme==_Scheme;}}
00033   TUrlScheme GetScheme(){return Scheme;}
00034   TStr GetUrlStr() const {return UrlStr;}
00035   TStr GetRelUrlStr() const {return RelUrlStr;}
00036   bool IsBaseUrl(){return !BaseUrlStr.Empty();}
00037   TStr GetBaseUrlStr() const {return BaseUrlStr;}
00038   TStr GetSchemeNm() const {EAssert(IsOk()); return SchemeNm;}
00039   TStr GetHostNm() const {EAssert(IsOk()); return HostNm;}
00040   TStr GetDmNm(const int& MxDmSegs=-1) const;
00041   bool IsPortOk() const { EAssert(IsOk()); return (PortN > 0); }
00042   TStr GetPortStr() const {EAssert(IsOk()); return PortStr;}
00043   int GetPortN() const {EAssert(IsOk()&&(PortN!=-1)); return PortN;}
00044   TStr GetPathStr() const {EAssert(IsOk()); return PathStr;}
00045   int GetPathSegs() const {return PathSegV.Len();}
00046   TStr GetPathSeg(const int& PathSegN) const {return PathSegV[PathSegN];}
00047   TStr GetSearchStr() const {EAssert(IsOk()); return SearchStr;}
00048   TStr GetFragIdStr() const {EAssert(IsOk()); return FragIdStr;}
00049 
00050   bool IsIpNum() const {return !IpNum.Empty();}
00051   void PutIpNum(const TStr& _IpNum){IpNum=_IpNum;}
00052   TStr GetIpNum() const {EAssert(IsIpNum()); return IpNum;}
00053   TStr GetIpNumOrHostNm() const {return IsIpNum() ? GetIpNum() : GetHostNm();}
00054 
00055   bool IsDefFinalUrl() const {
00056     EAssert(IsOk(usHttp)); return !FinalUrlStr.Empty();}
00057   TStr GetFinalUrlStr() const {
00058     EAssert(IsDefFinalUrl()); return FinalUrlStr;}
00059   TStr GetAsFinalUrlStr() const {
00060     if (IsDefFinalUrl()){return FinalUrlStr;} else {return UrlStr;}}
00061   TStr GetFinalHostNm() const {
00062     EAssert(IsDefFinalUrl()); return FinalHostNm;}
00063   TStr GetAsFinalHostNm() const {
00064     if (IsDefFinalUrl()){return FinalHostNm;} else {return HostNm;}}
00065   void DefUrlAsFinal(){
00066     EAssert(IsOk(usHttp)); EAssert(!IsDefFinalUrl());
00067     FinalUrlStr=UrlStr; FinalHostNm=HostNm;}
00068   void DefFinalUrl(const TStr& _FinalHostNm);
00069 
00070   void PutHttpRqStr(const TStr& _HttpRqStr){HttpRqStr=_HttpRqStr;}
00071   TStr GetHttpRqStr() const {return HttpRqStr;}
00072   bool IsHttpRqStr() const {return !HttpRqStr.Empty();}
00073   void ChangeHttpRqStr(const TStr& SrcStr, const TStr& DstStr){
00074     HttpRqStr.ChangeStr(SrcStr, DstStr);}
00075 
00076   bool IsInHost(const TStr& _HostNm) const {
00077     EAssert(IsOk()); return HostNm.GetUc().IsSuffix(_HostNm.GetUc());}
00078   bool IsInPath(const TStr& _PathStr) const {
00079     EAssert(IsOk()); return PathStr.GetUc().IsPrefix(_PathStr.GetUc());}
00080   void ToLcPath();
00081 
00082   static bool IsAbs(const TStr& UrlStr);
00083   static bool IsScript(const TStr& UrlStr);
00084   static bool IsSite(const TStr& UrlStr);
00085 
00086   static PUrl GetUrlFromShortcut(const TStr& ShortcutUrlStr,
00087    const TStr& DfHostNmPrefix, const TStr& DfHostNmSufix);
00088   static TStr GetUrlSearchStr(const TStr& Str);
00089   static TStr DecodeUrlStr(const TStr& UrlStr);
00090   static TStr GetDocStrFromUrlStr(const TStr& UrlStr, const int& Copies=1);
00091   static TStr GetTopDownDocNm(
00092    const TStr& UrlStr, const int& MxLen=-1, const bool& HostOnlyP=false);
00093 };
// Containers pairing an integer id with a url smart-pointer.
typedef TPair<TInt, PUrl> TIdUrlPr;   // (id, url) pair
typedef TQQueue<TIdUrlPr> TIdUrlPrQ;  // queue of (id, url) pairs
typedef THash<TInt, PUrl> TIdToUrlH;  // hash table from id to url
00097 
00099 // Url-Environment
00100 ClassTP(TUrlEnv, PUrlEnv)//{
00101 private:
00102   TStr BaseUrlStr;
00103   TStrV KeyNmV;
00104   TStrStrVH KeyNmToValH;
00105 public:
00106   TUrlEnv():
00107     KeyNmV(), KeyNmToValH(10){}
00108   TUrlEnv(const TUrlEnv& UrlEnv):
00109     KeyNmV(UrlEnv.KeyNmV), KeyNmToValH(UrlEnv.KeyNmToValH){}
00110   static PUrlEnv New(){return new TUrlEnv();}
00111   static PUrlEnv New(const TStr& BaseUrlStr,
00112    const TStr& KeyNm1=TStr(), const TStr& ValStr1=TStr(),
00113    const TStr& KeyNm2=TStr(), const TStr& ValStr2=TStr(),
00114    const TStr& KeyNm3=TStr(), const TStr& ValStr3=TStr(),
00115    const TStr& KeyNm4=TStr(), const TStr& ValStr4=TStr()){
00116     PUrlEnv UrlEnv=New();
00117     UrlEnv->PutBaseUrlStr(BaseUrlStr);
00118     if (!KeyNm1.Empty()){UrlEnv->AddKeyVal(KeyNm1, ValStr1);}
00119     if (!KeyNm2.Empty()){UrlEnv->AddKeyVal(KeyNm2, ValStr2);}
00120     if (!KeyNm3.Empty()){UrlEnv->AddKeyVal(KeyNm3, ValStr3);}
00121     if (!KeyNm4.Empty()){UrlEnv->AddKeyVal(KeyNm4, ValStr4);}
00122     return UrlEnv;}
00123   ~TUrlEnv(){}
00124   TUrlEnv(TSIn& SIn): KeyNmV(SIn), KeyNmToValH(SIn){}
00125   static PUrlEnv Load(TSIn& SIn){return new TUrlEnv(SIn);}
00126   void Save(TSOut& SOut){KeyNmV.Save(SOut); KeyNmToValH.Save(SOut);}
00127 
00128   TUrlEnv& operator=(const TUrlEnv& Env){
00129     if (this!=&Env){KeyNmV=Env.KeyNmV; KeyNmToValH=Env.KeyNmToValH;}
00130     return *this;}
00131 
00132   // base url
00133   void PutBaseUrlStr(const TStr& _BaseUrlStr){BaseUrlStr=_BaseUrlStr;}
00134   TStr GetBaseUrlStr() const {return BaseUrlStr;}
00135 
00136   // adding key-value
00137   void AddKeyVal(const TStr& KeyNm, const TStr& ValStr){
00138     if (!IsKey(KeyNm)){KeyNmV.Add(KeyNm); KeyNmToValH.AddKey(KeyNm);}
00139     KeyNmToValH.GetDat(KeyNm).Clr();
00140     KeyNmToValH.GetDat(KeyNm).Add(ValStr);}
00141   void AddToKeyVal(const TStr& KeyNm, const TStr& ValStr){
00142     if (!IsKey(KeyNm)){KeyNmV.Add(KeyNm); KeyNmToValH.AddKey(KeyNm);}
00143     KeyNmToValH.GetDat(KeyNm).Add(ValStr);}
00144 
00145   // key retrieval
00146   bool Empty() const {return KeyNmV.Empty();}
00147   int GetKeys() const {return KeyNmV.Len();}
00148   bool IsKey(const TStr& KeyNm) const {return KeyNmV.SearchForw(KeyNm)!=-1;}
00149   int GetKeyN(const TStr& KeyNm) const {return KeyNmV.SearchForw(KeyNm);}
00150   TStr GetKeyNm(const int& KeyN) const {return KeyNmV[KeyN];}
00151 
00152   // value retrieval
00153   int GetVals(const int& KeyN) const {
00154     return KeyNmToValH.GetDat(KeyNmV[KeyN]).Len();}
00155   int GetVals(const TStr& KeyNm) const {
00156     return KeyNmToValH.GetDat(KeyNm).Len();}
00157   TStr GetVal(const int& KeyN, const int& ValN=0) const {
00158     return KeyNmToValH.GetDat(KeyNmV[KeyN])[ValN];}
00159   TStr GetVal(const TStr& KeyNm, const int& ValN=0, const TStr& DfVal="") const {
00160     if (KeyNmToValH.IsKey(KeyNm)){
00161       return KeyNmToValH.GetDat(KeyNm)[ValN];}
00162     else {return DfVal;}}
00163 
00164   // full-url-string
00165   TStr GetFullUrlStr() const;
00166 
00167   static PUrlEnv MkClone(const PUrlEnv& UrlEnv);
00168 };
00169