SNAP Library 2.1, User Reference  2013-09-25 10:47:25
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
ss.cpp
Go to the documentation of this file.
00001 //#//////////////////////////////////////////////
00002 // Spread-Sheet
00003 TStr& TSs::At(const int& X, const int& Y){
00004 //  Fail;
00005   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00006   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00007   return CellStrVV[Y]->V[X];
00008 }
00009 
00010 void TSs::PutVal(const int& X, const int& Y, const TStr& Str){
00011   if (Y>=CellStrVV.Len()){CellStrVV.Reserve(Y+1, Y+1);}
00012   if (X>=CellStrVV[Y]->Len()){CellStrVV[Y]->V.Reserve(X+1, X+1);}
00013   CellStrVV[Y]->V[X]=Str;
00014 }
00015 
00016 TStr TSs::GetVal(const int& X, const int& Y) const {
00017   if ((0<=Y)&&(Y<CellStrVV.Len())){
00018     if ((0<=X)&&(X<CellStrVV[Y]->Len())){
00019       return CellStrVV[Y]->V[X];
00020     } else {
00021       return TStr::GetNullStr();
00022     }
00023   } else {
00024     return TStr::GetNullStr();
00025   }
00026 }
00027 
00028 int TSs::GetXLen() const {
00029   if (CellStrVV.Len()==0){
00030     return 0;
00031   } else {
00032     int MxXLen=CellStrVV[0]->Len();
00033     for (int Y=1; Y<CellStrVV.Len(); Y++){
00034       MxXLen=TInt::GetMx(MxXLen, CellStrVV[Y]->Len());}
00035     return MxXLen;
00036   }
00037 }
00038 
00039 int TSs::GetXLen(const int& Y) const {
00040   if ((0<=Y)&&(Y<CellStrVV.Len())){
00041     return CellStrVV[Y]->Len();
00042   } else {
00043     return 0;
00044   }
00045 }
00046 
00047 int TSs::GetYLen() const {
00048   return CellStrVV.Len();
00049 }
00050 
00051 int TSs::SearchX(const int& Y, const TStr& Str) const {
00052   return CellStrVV[Y]->V.SearchForw(Str);
00053 }
00054 
00055 int TSs::SearchY(const int& X, const TStr& Str) const {
00056   int YLen=GetYLen();
00057   for (int Y=0; Y<YLen; Y++){
00058      if (Str==GetVal(X, Y)){return Y;}}
00059   return -1;
00060 }
00061 
00062 void TSs::DelX(const int& X){
00063   int YLen=GetYLen();
00064   for (int Y=0; Y<YLen; Y++){
00065     CellStrVV[Y]->V.Del(X);
00066   }
00067 }
00068 
00069 void TSs::DelY(const int& Y){
00070   CellStrVV.Del(Y);
00071 }
00072 
00073 int TSs::GetFldX(const TStr& FldNm, const TStr& NewFldNm, const int& Y) const {
00074   if (GetYLen()>Y){
00075     int XLen=GetXLen(Y);
00076     for (int X=0; X<XLen; X++){
00077       if (GetVal(X, Y).GetTrunc()==FldNm){
00078         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00079         return X;
00080       }
00081     }
00082     return -1;
00083   } else {
00084     return -1;
00085   }
00086 }
00087 
00088 int TSs::GetFldY(const TStr& FldNm, const TStr& NewFldNm, const int& X) const {
00089   for (int Y=0; Y<GetYLen(); Y++){
00090     if (GetXLen(Y)>X){
00091       if (GetVal(X, Y).GetTrunc()==FldNm){
00092         if (!NewFldNm.Empty()){GetVal(X, Y)=NewFldNm;}
00093         return Y;
00094       }
00095     }
00096   }
00097   return -1;
00098 }
00099 
00100 PSs TSs::LoadTxt(
00101  const TSsFmt& SsFmt, const TStr& FNm,
00102  const PNotify& Notify, const bool& IsExcelEoln,
00103  const int& MxY, const TIntV& AllowedColNV, const bool& IsQStr){
00104   TNotify::OnNotify(Notify, ntInfo, TStr("Loading File ")+FNm+" ...");
00105   PSIn SIn=TFIn::New(FNm);
00106   PSs Ss=TSs::New();
00107   if (!SIn->Eof()){
00108     int X=0; int Y=0; int PrevX=-1; int PrevY=-1;
00109     char Ch=SIn->GetCh(); TChA ChA;
00110     while (!SIn->Eof()){
00111       // compose value
00112       ChA.Clr();
00113       if (IsQStr&&(Ch=='"')){
00114         // quoted string ('""' sequence means '"')
00115         Ch=SIn->GetCh();
00116         forever {
00117           while ((!SIn->Eof())&&(Ch!='"')){
00118             ChA+=Ch; Ch=SIn->GetCh();}
00119           if (Ch=='"'){
00120             Ch=SIn->GetCh();
00121             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00122             else {break;}
00123           }
00124         }
00125       } else {
00126         if (SsFmt==ssfTabSep){
00127           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00128             ChA+=Ch; Ch=SIn->GetCh();
00129           }
00130         } else
00131         if (SsFmt==ssfCommaSep){
00132           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00133             ChA+=Ch; Ch=SIn->GetCh();
00134           }
00135         } else
00136         if (SsFmt==ssfSemicolonSep){
00137           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00138             ChA+=Ch; Ch=SIn->GetCh();
00139           }
00140         } else
00141         if (SsFmt==ssfVBar){
00142           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00143             ChA+=Ch; Ch=SIn->GetCh();
00144           }
00145         } else
00146         if (SsFmt==ssfSpaceSep){
00147           while ((!SIn->Eof())&&(Ch!=' ')&&(Ch!='\r')&&((Ch!='\n')||IsExcelEoln)){
00148             ChA+=Ch; Ch=SIn->GetCh();
00149           }
00150         } else {
00151           Fail;
00152         }
00153       }
00154       // add new line if neccessary
00155       if (PrevY!=Y){
00156         if ((MxY!=-1)&&(Ss->CellStrVV.Len()==MxY)){break;}
00157         Ss->CellStrVV.Add(TStrVP::New()); PrevY=Y;
00158         int Recs=Ss->CellStrVV.Len();
00159         if (Recs%1000==0){
00160           TNotify::OnStatus(Notify, TStr::Fmt("  %d\r", Recs));}
00161       }
00162       // add value to spreadsheet
00163       if (AllowedColNV.Empty()||AllowedColNV.IsIn(X)){
00164         Ss->CellStrVV[Y]->V.Add(ChA); 
00165       }
00166       // process delimiters
00167       if (SIn->Eof()){
00168         break;
00169       } else
00170       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00171         X++; Ch=SIn->GetCh();
00172       } else
00173       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00174         X++; Ch=SIn->GetCh();
00175       } else
00176       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00177         X++; Ch=SIn->GetCh();
00178       } else
00179       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00180         X++; Ch=SIn->GetCh();
00181       } else
00182       if ((SsFmt==ssfSpaceSep)&&(Ch==' ')){
00183         X++; Ch=SIn->GetCh();
00184       } else
00185       if (Ch=='\r'){
00186         if ((PrevX!=-1)&&(X!=PrevX)){
00187           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00188         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00189         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00190         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00191       } else
00192       if (Ch=='\n'){
00193         if ((PrevX!=-1)&&(X!=PrevX)){
00194           TNotify::OnNotify(Notify, ntWarn, "Number of fields is not the same!");}
00195         PrevX=X; X=0; Y++; Ch=SIn->GetCh();
00196         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00197         //if (Ss->CellStrVV.Len()%1000==0){Y--; break;}
00198       } else {
00199         Fail;
00200       }
00201     }
00202   }
00203   int Recs=Ss->CellStrVV.Len();
00204   TNotify::OnNotify(Notify, ntInfo, TStr::Fmt("  %d records read.", Recs));
00205   TNotify::OnNotify(Notify, ntInfo, "... Done.");
00206   return Ss;
00207 }
00208 
00209 void TSs::SaveTxt(const TStr& FNm, const PNotify&) const {
00210   PSOut SOut=TFOut::New(FNm);
00211   for (int Y=0; Y<CellStrVV.Len(); Y++){
00212     for (int X=0; X<CellStrVV[Y]->Len(); X++){
00213       if (X>0){SOut->PutCh('\t');}
00214       TStr Str=CellStrVV[Y]->V[X];
00215       TChA ChA(Str);
00216       for (int ChN=0; ChN<ChA.Len(); ChN++){
00217         char Ch=ChA[ChN];
00218         if ((Ch=='\t')||(Ch=='\r')||(Ch=='\n')){
00219           ChA.PutCh(ChN, ' ');
00220         }
00221       }
00222       SOut->PutStr(ChA);
00223     }
00224     SOut->PutCh('\r'); SOut->PutCh('\n');
00225   }
00226 }
00227 
00228 void TSs::LoadTxtFldV(
00229  const TSsFmt& SsFmt, const PSIn& SIn, char& Ch,
00230  TStrV& FldValV, const bool& IsExcelEoln, const bool& IsQStr){
00231   if (!SIn->Eof()){
00232     FldValV.Clr(false); int X=0;
00233     if (Ch==TCh::NullCh){Ch=SIn->GetCh();}
00234     TChA ChA;
00235     while (!SIn->Eof()){
00236       // compose value
00237       ChA.Clr();
00238       if (IsQStr&&(Ch=='"')){
00239         // quoted string ('""' sequence means '"')
00240         Ch=SIn->GetCh();
00241         forever {
00242           while ((!SIn->Eof())&&(Ch!='"')){
00243             ChA+=Ch; Ch=SIn->GetCh();}
00244           if (Ch=='"'){
00245             Ch=SIn->GetCh();
00246             if (Ch=='"'){ChA+=Ch; Ch=SIn->GetCh();}
00247             else {break;}
00248           }
00249         }
00250       } else {
00251         if (SsFmt==ssfTabSep){
00252           while ((!SIn->Eof())&&(Ch!='\t')&&(Ch!='\r')&&
00253            ((Ch!='\n')||IsExcelEoln)){
00254             ChA+=Ch; Ch=SIn->GetCh();
00255           }
00256           if ((!ChA.Empty())&&(ChA.LastCh()=='\"')){
00257             ChA.Pop();}
00258         } else
00259         if (SsFmt==ssfCommaSep){
00260           while ((!SIn->Eof())&&(Ch!=',')&&(Ch!='\r')&&
00261            ((Ch!='\n')||IsExcelEoln)){
00262             ChA+=Ch; Ch=SIn->GetCh();
00263           }
00264         } else
00265         if (SsFmt==ssfSemicolonSep){
00266           while ((!SIn->Eof())&&(Ch!=';')&&(Ch!='\r')&&
00267            ((Ch!='\n')||IsExcelEoln)){
00268             ChA+=Ch; Ch=SIn->GetCh();
00269           }
00270         } else
00271         if (SsFmt==ssfVBar){
00272           while ((!SIn->Eof())&&(Ch!='|')&&(Ch!='\r')&&
00273            ((Ch!='\n')||IsExcelEoln)){
00274             ChA+=Ch; Ch=SIn->GetCh();
00275           }
00276         } else {
00277           Fail;
00278         }
00279       }
00280       // add value to spreadsheet
00281       ChA.Trunc();
00282       FldValV.Add(ChA);
00283       // process delimiters
00284       if (SIn->Eof()){
00285         break;
00286       } else
00287       if ((SsFmt==ssfTabSep)&&(Ch=='\t')){
00288         X++; Ch=SIn->GetCh();
00289       } else
00290       if ((SsFmt==ssfCommaSep)&&(Ch==',')){
00291         X++; Ch=SIn->GetCh();
00292       } else
00293       if ((SsFmt==ssfSemicolonSep)&&(Ch==';')){
00294         X++; Ch=SIn->GetCh();
00295       } else
00296       if ((SsFmt==ssfVBar)&&(Ch=='|')){
00297         X++; Ch=SIn->GetCh();
00298       } else
00299       if (Ch=='\r'){
00300         Ch=SIn->GetCh();
00301         if ((Ch=='\n')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00302         break;
00303       } else
00304       if (Ch=='\n'){
00305         X=0; Ch=SIn->GetCh();
00306         if ((Ch=='\r')&&(!SIn->Eof())){Ch=SIn->GetCh();}
00307         break;
00308       } else {
00309         Fail;
00310       }
00311     }
00312   }
00313 }
00314 
00315 TSsFmt TSs::GetSsFmtFromStr(const TStr& SsFmtNm){
00316   TStr LcSsFmtNm=SsFmtNm.GetLc();
00317   if (LcSsFmtNm=="tab"){return ssfTabSep;}
00318   else if (LcSsFmtNm=="comma"){return ssfCommaSep;}
00319   else if (LcSsFmtNm=="semicolon"){return ssfSemicolonSep;}
00320   else if (LcSsFmtNm=="vbar"){return ssfVBar;}
00321   else if (LcSsFmtNm=="space"){return ssfSpaceSep;}
00322   else if (LcSsFmtNm=="white"){return ssfWhiteSep;}
00323   else {return ssfUndef;}
00324 }
00325 
00326 TStr TSs::GetStrFromSsFmt(const TSsFmt& SsFmt){
00327   switch (SsFmt){
00328     case ssfTabSep: return "tab";
00329     case ssfCommaSep: return "comma";
00330     case ssfSemicolonSep: return "semicolon";
00331     case ssfVBar: return "vbar";
00332     case ssfSpaceSep: return "space";
00333     case ssfWhiteSep: return "white";
00334     default: return "undef";
00335   }
00336 }
00337 
00338 TStr TSs::GetSsFmtNmVStr(){
00339   TChA ChA;
00340   ChA+='(';
00341   ChA+="tab"; ChA+=", ";
00342   ChA+="comma"; ChA+=", ";
00343   ChA+="semicolon"; ChA+=", ";
00344   ChA+="space"; ChA+=", ";
00345   ChA+="white"; ChA+=")";
00346   return ChA;
00347 }
00348 
00349 //#//////////////////////////////////////////////
00350 // Fast-Spread-Sheet-Parser
00351 TSsParser::TSsParser(const TStr& FNm, const TSsFmt _SsFmt, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(_SsFmt), 
00352  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), LineStr(), FldV(), FInPt(NULL) {
00353   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00354   else { FInPt = TFIn::New(FNm); }
00355   //Bf = new char [BfLen];
00356   switch(SsFmt) {
00357     case ssfTabSep : SplitCh = '\t'; break;
00358     case ssfCommaSep : SplitCh = ','; break;
00359     case ssfSemicolonSep : SplitCh = ';'; break;
00360     case ssfVBar : SplitCh = '|'; break;
00361     case ssfSpaceSep : SplitCh = ' '; break;
00362     case ssfWhiteSep: SplitCh = ' '; break;
00363     default: FailR("Unknown separator character.");
00364   }
00365 }
00366 
00367 TSsParser::TSsParser(const TStr& FNm, const char& Separator, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(ssfSpaceSep), 
00368  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), LineStr(), FldV(), FInPt(NULL) {
00369   if (TZipIn::IsZipExt(FNm.GetFExt())) { FInPt = TZipIn::New(FNm); }
00370   else { FInPt = TFIn::New(FNm); }
00371   SplitCh = Separator;
00372 }
00373 
00374 TSsParser::~TSsParser() {
00375   //if (Bf != NULL) { delete [] Bf; }
00376 }
00377 
00378 // Gets and parses the next line.
00379 // This version of Next() is older, slower, works with chars.
00380 // RS 01/22/13 obsolete, can be removed in the future
00381 
00382 bool TSsParser::NextSlow() { // split on SplitCh
00383   FldV.Clr(false);
00384   LineStr.Clr();
00385   FldV.Clr();
00386   LineCnt++;
00387   if (! FInPt->GetNextLn(LineStr)) { return false; }
00388   if (SkipCmt && !LineStr.Empty() && LineStr[0]=='#') { return NextSlow(); }
00389 
00390   char* cur = LineStr.CStr();
00391   if (SkipLeadBlanks) { // skip leading blanks
00392     while (*cur && TCh::IsWs(*cur)) { cur++; }
00393   }
00394   char *last = cur;
00395   while (*cur) {
00396     if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } } 
00397     else { while (*cur && *cur!=SplitCh) { cur++; } }
00398     if (*cur == 0) { break; }
00399     *cur = 0;  cur++;
00400     FldV.Add(last);  last = cur;
00401     if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
00402   }
00403   FldV.Add(last);  // add last field
00404   if (SkipEmptyFld && FldV.Empty()) { return NextSlow(); } // skip empty lines
00405   return true; 
00406 }
00407 
00408 // Gets and parses the next line, quick version, works with buffers, not chars.
00409 
00410 bool TSsParser::Next() { // split on SplitCh
00411   FldV.Clr(false);
00412   LineStr.Clr();
00413   FldV.Clr();
00414   LineCnt++;
00415   if (! FInPt->GetNextLnBf(LineStr)) { return false; }
00416   if (SkipCmt && !LineStr.Empty() && LineStr[0]=='#') { return Next(); }
00417 
00418   char* cur = LineStr.CStr();
00419   if (SkipLeadBlanks) { // skip leading blanks
00420     while (*cur && TCh::IsWs(*cur)) { cur++; }
00421   }
00422   char *last = cur;
00423   while (*cur) {
00424     if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } } 
00425     else { while (*cur && *cur!=SplitCh) { cur++; } }
00426     if (*cur == 0) { break; }
00427     *cur = 0;  cur++;
00428     FldV.Add(last);  last = cur;
00429     if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
00430   }
00431   FldV.Add(last);  // add last field
00432   if (SkipEmptyFld && FldV.Empty()) { return Next(); } // skip empty lines
00433   return true; 
00434 }
00435 
00436 void TSsParser::ToLc() {
00437   for (int f = 0; f < FldV.Len(); f++) {
00438     for (char *c = FldV[f]; *c; c++) {
00439       *c = tolower(*c); }
00440   }
00441 }
00442 
00443 bool TSsParser::GetInt(const int& FldN, int& Val) const {
00444   // parsing format {ws} [+/-] +{ddd}
00445   int _Val = -1;
00446   bool Minus=false;
00447   const char *c = GetFld(FldN);
00448   while (TCh::IsWs(*c)) { c++; }
00449   if (*c=='-') { Minus=true; c++; }
00450   if (! TCh::IsNum(*c)) { return false; }
00451   _Val = TCh::GetNum(*c);  c++;
00452   while (TCh::IsNum(*c)){ 
00453     _Val = 10 * _Val + TCh::GetNum(*c); 
00454     c++; 
00455   }
00456   if (Minus) { _Val = -_Val; }
00457   if (*c != 0) { return false; }
00458   Val = _Val;
00459   return true;
00460 }
00461 
00462 bool TSsParser::GetFlt(const int& FldN, double& Val) const {
00463   // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
00464   const char *c = GetFld(FldN);
00465   while (TCh::IsWs(*c)) { c++; }
00466   if (*c=='+' || *c=='-') { c++; }
00467   if (! TCh::IsNum(*c) && *c!='.') { return false; }
00468   while (TCh::IsNum(*c)) { c++; }
00469   if (*c == '.') {
00470     c++;
00471     while (TCh::IsNum(*c)) { c++; }
00472   }
00473   if (*c=='e' || *c == 'E') {
00474     c++;
00475     if (*c == '+' || *c == '-' ) { c++; }
00476     if (! TCh::IsNum(*c)) { return false; }
00477     while (TCh::IsNum(*c)) { c++; }
00478   }
00479   if (*c != 0) { return false; }
00480   Val = atof(GetFld(FldN));
00481   return true;
00482 }
00483 
00484 const char* TSsParser::DumpStr() const {
00485   static TChA ChA(10*1024);
00486   ChA.Clr();
00487   for (int i = 0; i < FldV.Len(); i++) {
00488     ChA += TStr::Fmt("  %d: '%s'\n", i, FldV[i]);
00489   }
00490   return ChA.CStr();
00491 }
00492