SNAP Library, Developer Reference  2012-10-02 12:56:23
SNAP, a general purpose network analysis and graph mining library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
ss.cpp
Go to the documentation of this file.
00001 
00002 // Spread-Sheet
00003 TStr& TSs::At(const int& X, const int& Y){
00004 //  Fail;
00005   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00006   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00007   return CellStrVV[Y]->V[X];
00008 }
00009 
00010 void TSs::PutVal(const int& X, const int& Y, const TStr& Str){
00011   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00012   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00013   CellStrVV[Y]->V[X]=Str;
00014 }
00015 
00016 TStr TSs::GetVal(const int& X, const int& Y) const {
00017   if ((0<=Y)&&(Y<CellStrVV.Len())){
00018     if ((0<=X)&&(X<CellStrVV[Y]->Len())){
00019       return CellStrVV[Y]->V[X];
00020     } else {
00021       return TStr::GetNullStr();
00022     }
00023   } else {
00024     return TStr::GetNullStr();
00025   }
00026 }
00027 
00028 int TSs::GetXLen() const {
00029   if (CellStrVV.Len()==0){
00030     return 0;
00031   } else {
00032     int MxXLen=CellStrVV[0]->Len();
00033     for (int Y=1; Y<CellStrVV.Len(); Y++){
00034       MxXLen=TInt::GetMx(MxXLen, CellStrVV[Y]->Len());}
00035     return MxXLen;
00036   }
00037 }
00038 
00039 int TSs::GetXLen(const int& Y) const {
00040   if ((0<=Y)&&(Y<CellStrVV.Len())){
00041     return CellStrVV[Y]->Len();
00042   } else {
00043     return 0;
00044   }
00045 }
00046 
00047 int TSs::GetYLen() const {
00048   return CellStrVV.Len();
00049 }
00050 
00051 int TSs::SearchX(const int& Y, const TStr& Str) const {
00052   return CellStrVV[Y]->V.SearchForw(Str);
00053 }
00054 
00055 int TSs::SearchY(const int& X, const TStr& Str) const {
00056   int YLen=GetYLen();
00057   for (int Y=0; Y<YLen; Y++){
00058      if (Str==GetVal(X, Y)){return Y;}}
00059   return -1;
00060 }
00061 
00062 void TSs::DelX(const int& X){
00063   int YLen=GetYLen();
00064   for (int Y=0; Y<YLen; Y++){
00065     CellStrVV[Y]->V.Del(X);
00066   }
00067 }
00068 
00069 void TSs::DelY(const int& Y){
00070   CellStrVV.Del(Y);
00071 }
00072 
00073 int TSs::GetFldX(const TStr& FldNm, const TStr& NewFldNm, const int& Y) const {
00074   if (GetYLen()>Y){
00075     int XLen=GetXLen(Y);
00076     for (int X=0; X<XLen; X++){
00077       if (GetVal(X, Y).GetTrunc()==FldNm){
00078         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00079         return X;
00080       }
00081     }
00082     return -1;
00083   } else {
00084     return -1;
00085   }
00086 }
00087 
00088 int TSs::GetFldY(const TStr& FldNm, const TStr& NewFldNm, const int& X) const {
00089   for (int Y=0; Y<GetYLen(); Y++){
00090     if (GetXLen(Y)>X){
00091       if (GetVal(X, Y).GetTrunc()==FldNm){
00092         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00093         return Y;
00094       }
00095     }
00096   }
00097   return -1;
00098 }
00099 
00100 PSs TSs::LoadTxt(
00101  const TSsFmt& SsFmt, const TStr& FNm,
00102  const PNotify& Notify, const bool& IsExcelEoln,
00103  const int& MxY, const TIntV& AllowedColNV, const bool& IsQStr){
00104   TNotify::OnNotify(Notify, ntInfo, TStr("Loading File ")+FNm+" ...");
00105   PSIn SIn=TFIn::New(FNm);
00106   PSs Ss=TSs::New();
00107   if (!SIn->Eof()){
00108     int X=0; int Y=0; int PrevX=-1; int PrevY=-1;
00109     char Ch=SIn->GetCh(); TChA ChA;
00110     while (!SIn->Eof()){
00111       // compose value
00112       ChA.Clr();
00113       if (IsQStr&&(Ch=='"')){
00114         // quoted string ('""' sequence means '"')
00115         Ch=SIn->GetCh();
00116         forever {
00117           while ((!SIn->Eof())&&(Ch!='"')){
00118             ChA+=Ch; Ch=SIn->GetCh();}
00119           if (Ch=='"'){
00120             Ch=SIn->GetCh();
00121             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00122             else {break;}
00123           }
00124         }
00125       } else {
00126         if (SsFmt==ssfTabSep){
00127           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00128             ChA+=Ch; Ch=SIn->GetCh();
00129           }
00130         } else
00131         if (SsFmt==ssfCommaSep){
00132           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00133             ChA+=Ch; Ch=SIn->GetCh();
00134           }
00135         } else
00136         if (SsFmt==ssfSemicolonSep){
00137           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00138             ChA+=Ch; Ch=SIn->GetCh();
00139           }
00140         } else
00141         if (SsFmt==ssfVBar){
00142           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00143             ChA+=Ch; Ch=SIn->GetCh();
00144           }
00145         } else
00146         if (SsFmt==ssfSpaceSep){
00147           while ((!SIn->Eof())&&(Ch!=' ')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00148             ChA+=Ch; Ch=SIn->GetCh();
00149           }
00150         } else {
00151           Fail;
00152         }
00153       }
00154       // add new line if neccessary
00155       if (PrevY!=Y){
00156         if ((MxY!=-1)&&(Ss->CellStrVV.Len()==MxY)){break;}
00157         Ss->CellStrVV.Add(TStrVP::New()); PrevY=Y;
00158         int Recs=Ss->CellStrVV.Len();
00159         if (Recs%1000==0){
00160           TNotify::OnStatus(Notify, TStr::Fmt("  %d\r", Recs));}
00161       }
00162       // add value to spreadsheet
00163       if (AllowedColNV.Empty()||AllowedColNV.IsIn(X)){
00164         Ss->CellStrVV[Y]->V.Add(ChA); 
00165       }
00166       // process delimiters
00167       if (SIn->Eof()){
00168         break;
00169       } else
00170       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00171         X++; Ch=SIn->GetCh();
00172       } else
00173       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00174         X++; Ch=SIn->GetCh();
00175       } else
00176       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00177         X++; Ch=SIn->GetCh();
00178       } else
00179       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00180         X++; Ch=SIn->GetCh();
00181       } else
00182       if ((SsFmt==ssfSpaceSep)&&(Ch==' ')){
00183         X++; Ch=SIn->GetCh();
00184       } else
00185       if (Ch=='\r'){
00186         if ((PrevX!=-1)&&(X!=PrevX)){
00187           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00188         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00189         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00190         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00191       } else
00192       if (Ch=='\n'){
00193         if ((PrevX!=-1)&&(X!=PrevX)){
00194           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00195         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00196         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00197         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00198       } else {
00199         Fail;
00200       }
00201     }
00202   }
00203   int Recs=Ss->CellStrVV.Len();
00204   TNotify::OnNotify(Notify, ntInfo, TStr::Fmt("  %d records read.", Recs));
00205   TNotify::OnNotify(Notify, ntInfo, "... Done.");
00206   return Ss;
00207 }
00208 
00209 void TSs::SaveTxt(const TStr& FNm, const PNotify&) const {
00210   PSOut SOut=TFOut::New(FNm);
00211   for (int Y=0; Y<CellStrVV.Len(); Y++){
00212     for (int X=0; X<CellStrVV[Y]->Len(); X++){
00213       if (X>0){SOut->PutCh('\t');}
00214       TStr Str=CellStrVV[Y]->V[X];
00215       TChA ChA(Str);
00216       for (int ChN=0; ChN<ChA.Len(); ChN++){
00217         char Ch=ChA[ChN];
00218         if ((Ch=='\t')||(Ch=='\r')||(Ch=='\n')){
00219           ChA.PutCh(ChN, ' ');
00220         }
00221       }
00222       SOut->PutStr(ChA);
00223     }
00224     SOut->PutCh('\r'); SOut->PutCh('\n');
00225   }
00226 }
00227 
00228 void TSs::LoadTxtFldV(
00229  const TSsFmt& SsFmt, const PSIn& SIn, char& Ch,
00230  TStrV& FldValV, const bool& IsExcelEoln, const bool& IsQStr){
00231   if (!SIn->Eof()){
00232     FldValV.Clr(false); int X=0;
00233     if (Ch==TCh::NullCh){Ch=SIn->GetCh();}
00234     TChA ChA;
00235     while (!SIn->Eof()){
00236       // compose value
00237       ChA.Clr();
00238       if (IsQStr&&(Ch=='"')){
00239         // quoted string ('""' sequence means '"')
00240         Ch=SIn->GetCh();
00241         forever {
00242           while ((!SIn->Eof())&&(Ch!='"')){
00243             ChA+=Ch; Ch=SIn->GetCh();}
00244           if (Ch=='"'){
00245             Ch=SIn->GetCh();
00246             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00247             else {break;}
00248           }
00249         }
00250       } else {
00251         if (SsFmt==ssfTabSep){
00252           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&
00253            ((Ch!='\n')||IsExcelEoln)){
00254             ChA+=Ch; Ch=SIn->GetCh();
00255           }
00256           if ((!ChA.Empty())&&(ChA.LastCh()=='\"')){
00257             ChA.Pop();}
00258         } else
00259         if (SsFmt==ssfCommaSep){
00260           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&
00261            ((Ch!='\n')||IsExcelEoln)){
00262             ChA+=Ch; Ch=SIn->GetCh();
00263           }
00264         } else
00265         if (SsFmt==ssfSemicolonSep){
00266           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&
00267            ((Ch!='\n')||IsExcelEoln)){
00268             ChA+=Ch; Ch=SIn->GetCh();
00269           }
00270         } else
00271         if (SsFmt==ssfVBar){
00272           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&
00273            ((Ch!='\n')||IsExcelEoln)){
00274             ChA+=Ch; Ch=SIn->GetCh();
00275           }
00276         } else {
00277           Fail;
00278         }
00279       }
00280       // add value to spreadsheet
00281       ChA.Trunc();
00282       FldValV.Add(ChA);
00283       // process delimiters
00284       if (SIn->Eof()){
00285         break;
00286       } else
00287       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00288         X++; Ch=SIn->GetCh();
00289       } else
00290       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00291         X++; Ch=SIn->GetCh();
00292       } else
00293       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00294         X++; Ch=SIn->GetCh();
00295       } else
00296       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00297         X++; Ch=SIn->GetCh();
00298       } else
00299       if (Ch=='\r'){
00300         Ch=SIn->GetCh();
00301         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00302         break;
00303       } else
00304       if (Ch=='\n'){
00305         X=0; Ch=SIn->GetCh();
00306         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00307         break;
00308       } else {
00309         Fail;
00310       }
00311     }
00312   }
00313 }
00314 
00315 TSsFmt TSs::GetSsFmtFromStr(const TStr& SsFmtNm){
00316   TStr LcSsFmtNm=SsFmtNm.GetLc();
00317   if (LcSsFmtNm=="tab"){return ssfTabSep;}
00318   else if (LcSsFmtNm=="comma"){return ssfCommaSep;}
00319   else if (LcSsFmtNm=="semicolon"){return ssfSemicolonSep;}
00320   else if (LcSsFmtNm=="vbar"){return ssfVBar;}
00321   else if (LcSsFmtNm=="space"){return ssfSpaceSep;}
00322   else if (LcSsFmtNm=="white"){return ssfWhiteSep;}
00323   else {return ssfUndef;}
00324 }
00325 
00326 TStr TSs::GetStrFromSsFmt(const TSsFmt& SsFmt){
00327   switch (SsFmt){
00328     case ssfTabSep: return "tab";
00329     case ssfCommaSep: return "comma";
00330     case ssfSemicolonSep: return "semicolon";
00331     case ssfVBar: return "vbar";
00332     case ssfSpaceSep: return "space";
00333     case ssfWhiteSep: return "white";
00334     default: return "undef";
00335   }
00336 }
00337 
00338 TStr TSs::GetSsFmtNmVStr(){
00339   TChA ChA;
00340   ChA+='(';
00341   ChA+="tab"; ChA+=", ";
00342   ChA+="comma"; ChA+=", ";
00343   ChA+="semicolon"; ChA+=", ";
00344   ChA+="space"; ChA+=", ";
00345   ChA+="white"; ChA+=")";
00346   return ChA;
00347 }
00348 
00350 // Fast-Spread-Sheet-Parser
00351 TSsParser::TSsParser(const TStr& FNm, const TSsFmt _SsFmt, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(_SsFmt), 
00352  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), FldV(), FInPt(NULL) {
00353   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00354   else { FInPt = TFIn::New(FNm); }
00355   //Bf = new char [BfLen];
00356   switch(SsFmt) {
00357     case ssfTabSep : SplitCh = '\t'; break;
00358     case ssfCommaSep : SplitCh = ','; break;
00359     case ssfSemicolonSep : SplitCh = ';'; break;
00360     case ssfVBar : SplitCh = '|'; break;
00361     case ssfSpaceSep : SplitCh = ' '; break;
00362     case ssfWhiteSep: SplitCh = ' '; break;
00363     default: FailR("Unknown separator character.");
00364   }
00365 }
00366 
00367 TSsParser::TSsParser(const TStr& FNm, const char& Separator, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(ssfSpaceSep), 
00368  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), FldV(), FInPt(NULL) {
00369   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00370   else { FInPt = TFIn::New(FNm); }
00371   SplitCh = Separator;
00372 }
00373 
00374 TSsParser::~TSsParser() {
00375   //if (Bf != NULL) { delete [] Bf; }
00376 }
00377 
00378 bool TSsParser::Next() { // split on SplitCh
00379   FldV.Clr(false);
00380   LineStr.Clr();
00381   FldV.Clr();
00382   LineCnt++;
00383   if (! FInPt->GetNextLn(LineStr)) { return false; }
00384   if (SkipCmt && LineStr.Len()>0 && LineStr[0]=='#') { return Next(); }
00385 
00386   char* cur = LineStr.CStr();
00387   if (SkipLeadBlanks) { // skip leadning blanks
00388     while (*cur && TCh::IsWs(*cur)) { cur++; }
00389   }
00390   char *last = cur;
00391   while (*cur) {
00392     if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } } 
00393     else { while (*cur && *cur!=SplitCh) { cur++; } }
00394     if (*cur == 0) { break; }
00395     *cur = 0;  cur++;
00396     FldV.Add(last);  last = cur;
00397     if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
00398   }
00399   FldV.Add(last);  // add last field
00400   if (SkipEmptyFld && FldV.Empty()) { return Next(); } // skip empty lines
00401   return true; 
00402 }
00403 
00404 void TSsParser::ToLc() {
00405   for (int f = 0; f < FldV.Len(); f++) {
00406     for (char *c = FldV[f]; *c; c++) {
00407       *c = tolower(*c); }
00408   }
00409 }
00410 
00411 bool TSsParser::GetInt(const int& FldN, int& Val) const {
00412   // parsing format {ws} [+/-] +{ddd}
00413   int _Val = -1;
00414   bool Minus=false;
00415   const char *c = GetFld(FldN);
00416   while (TCh::IsWs(*c)) { c++; }
00417   if (*c=='-') { Minus=true; c++; }
00418   if (! TCh::IsNum(*c)) { return false; }
00419   _Val = TCh::GetNum(*c);  c++;
00420   while (TCh::IsNum(*c)){ 
00421     _Val = 10 * _Val + TCh::GetNum(*c); 
00422     c++; 
00423   }
00424   if (Minus) { _Val = -_Val; }
00425   if (*c != 0) { return false; }
00426   Val = _Val;
00427   return true;
00428 }
00429 
00430 bool TSsParser::GetFlt(const int& FldN, double& Val) const {
00431   // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
00432   const char *c = GetFld(FldN);
00433   while (TCh::IsWs(*c)) { c++; }
00434   if (*c=='+' || *c=='-') { c++; }
00435   if (! TCh::IsNum(*c) && *c!='.') { return false; }
00436   while (TCh::IsNum(*c)) { c++; }
00437   if (*c == '.') {
00438     c++;
00439     while (TCh::IsNum(*c)) { c++; }
00440   }
00441   if (*c=='e' || *c == 'E') {
00442     c++;
00443     if (*c == '+' || *c == '-' ) { c++; }
00444     if (! TCh::IsNum(*c)) { return false; }
00445     while (TCh::IsNum(*c)) { c++; }
00446   }
00447   if (*c != 0) { return false; }
00448   Val = atof(GetFld(FldN));
00449   return true;
00450 }
00451 
00452 const char* TSsParser::DumpStr() const {
00453   static TChA ChA(10*1024);
00454   ChA.Clr();
00455   for (int i = 0; i < FldV.Len(); i++) {
00456     ChA += TStr::Fmt("  %d: '%s'\n", i, FldV[i]);
00457   }
00458   return ChA.CStr();
00459 }
00460