SNAP Library 4.0, Developer Reference  2017-07-27 13:18:06
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
ssmp.cpp
Go to the documentation of this file.
1 //#//////////////////////////////////////////////
2 
3 TSsParserMP::TSsParserMP(const TStr& FNm, const char& Separator, const bool& _SkipLeadBlanks, const bool& _SkipCmt, const bool& _SkipEmptyFld) : SsFmt(ssfSpaceSep),
4  SkipLeadBlanks(_SkipLeadBlanks), SkipCmt(_SkipCmt), SkipEmptyFld(_SkipEmptyFld), LineCnt(0), /*Bf(NULL),*/ SplitCh('\t'), LineStr(), FldV(), FInPt(NULL) {
5  FInPt = TMIn::New(FNm, true);
6  SplitCh = Separator;
7 }
8 
10 }
11 
14 }
15 
16 // Gets and parses the next line, quick version, works with buffers, not chars.
17 bool TSsParserMP::Next() { // split on SplitCh
18  FldV.Clr(false);
19  LineStr.Clr();
20  FldV.Clr();
21  LineCnt++;
22  if (! FInPt->GetNextLnBf(LineStr)) { return false; }
23  if (SkipCmt && !LineStr.Empty() && LineStr[0]=='#') { return Next(); }
24 
25  char* cur = LineStr.CStr();
26  if (SkipLeadBlanks) { // skip leading blanks
27  while (*cur && TCh::IsWs(*cur)) { cur++; }
28  }
29  char *last = cur;
30  while (*cur) {
31  if (SsFmt == ssfWhiteSep) { while (*cur && ! TCh::IsWs(*cur)) { cur++; } }
32  else { while (*cur && *cur!=SplitCh) { cur++; } }
33  if (*cur == 0) { break; }
34  *cur = 0; cur++;
35  FldV.Add(last); last = cur;
36  if (SkipEmptyFld && strlen(FldV.Last())==0) { FldV.DelLast(); } // skip empty fields
37  }
38  FldV.Add(last); // add last field
39  if (SkipEmptyFld && FldV.Empty()) { return Next(); } // skip empty lines
40  return true;
41 }
42 
44  for (int f = 0; f < FldV.Len(); f++) {
45  for (char *c = FldV[f]; *c; c++) {
46  *c = tolower(*c); }
47  }
48 }
49 
50 bool TSsParserMP::GetInt(const int& FldN, int& Val) const {
51  // parsing format {ws} [+/-] +{ddd}
52  int _Val = -1;
53  bool Minus=false;
54  const char *c = GetFld(FldN);
55  while (TCh::IsWs(*c)) { c++; }
56  if (*c=='-') { Minus=true; c++; }
57  if (! TCh::IsNum(*c)) { return false; }
58  _Val = TCh::GetNum(*c); c++;
59  while (TCh::IsNum(*c)){
60  _Val = 10 * _Val + TCh::GetNum(*c);
61  c++;
62  }
63  if (Minus) { _Val = -_Val; }
64  if (*c != 0) { return false; }
65  Val = _Val;
66  return true;
67 }
68 
69 bool TSsParserMP::GetFlt(const int& FldN, double& Val) const {
70  // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
71  const char *c = GetFld(FldN);
72  while (TCh::IsWs(*c)) { c++; }
73  if (*c=='+' || *c=='-') { c++; }
74  if (! TCh::IsNum(*c) && *c!='.') { return false; }
75  while (TCh::IsNum(*c)) { c++; }
76  if (*c == '.') {
77  c++;
78  while (TCh::IsNum(*c)) { c++; }
79  }
80  if (*c=='e' || *c == 'E') {
81  c++;
82  if (*c == '+' || *c == '-' ) { c++; }
83  if (! TCh::IsNum(*c)) { return false; }
84  while (TCh::IsNum(*c)) { c++; }
85  }
86  if (*c != 0) { return false; }
87  Val = atof(GetFld(FldN));
88  return true;
89 }
90 
91 const char* TSsParserMP::DumpStr() const {
92  static TChA ChA(10*1024);
93  ChA.Clr();
94  for (int i = 0; i < FldV.Len(); i++) {
95  ChA += TStr::Fmt(" %d: '%s'\n", i, FldV[i]);
96  }
97  return ChA.CStr();
98 }
99 
100 // Finds number of new line chars in interval [lb, ub)
101 // Assumes that lines end in '\n'
103  return FInPt->CountNewLinesInRange(Lb, Ub);
104 }
105 
107  TVec<uint64> Ret;
108  if (Lb >= GetStreamLen()) {
109  return Ret;
110  }
111  while (Lb < Ub) {
112  // Find line corresponding to Lb
113  uint64 StartPos = FInPt->GetLineStartPos(Lb);
114  uint64 EndPos = FInPt->GetLineEndPos(Lb);
115 
116  // If line ends in given range, add to count
117  if (Lb <= EndPos && EndPos < Ub) {
118  Ret.Add(StartPos);
119  }
120  // Start at next line
121  Lb = EndPos + 1;
122  }
123  return Ret;
124 }
125 
126 // Essentially the same as TSsParser::Next
127 // For parallel load, FldV cannot be shared across many threads
129 {
130  // split on SplitCh
131  FieldsV.Clr();
132 
133  char* cur = FInPt->GetLine(Index);
134 
135  if (SkipLeadBlanks) { // skip leading blanks
136  while (*cur && TCh::IsWs(*cur)) { cur++; }
137  }
138  char *last = cur;
139  while (*cur != 0 && *cur != '\n') {
140  if (SsFmt == ssfWhiteSep) { while (*cur && (*cur != '\n') && ! TCh::IsWs(*cur)) { cur++; } }
141  else { while (*cur && *cur!=SplitCh && (*cur != '\n')) { cur++; } }
142  if (*cur == 0) { break; }
143  if (*cur == '\n') { break; }
144  //*cur = 0;
145  cur++;
146  FieldsV.Add(last); last = cur;
147  if (SkipEmptyFld && strlen(FieldsV.Last())==0) { FieldsV.DelLast(); } // skip empty fields
148  }
149  FieldsV.Add(last); // add last field
150 }
151 
152 int TSsParserMP::GetIntFromFldV(TVec<char*>& FieldsV, const int& FldN) {
153  // parsing format {ws} [+/-] +{ddd}
154  int _Val = -1;
155  bool Minus=false;
156  const char *c = FieldsV[FldN];
157  while (TCh::IsWs(*c)) { c++; }
158  if (*c=='-') { Minus=true; c++; }
159  if (! TCh::IsNum(*c)) { return -1; }
160  _Val = TCh::GetNum(*c); c++;
161  while (TCh::IsNum(*c)){
162  _Val = 10 * _Val + TCh::GetNum(*c);
163  c++;
164  }
165  if (Minus) { _Val = -_Val; }
166  //if (*c != 0) { return -1; }
167  return _Val;
168 }
169 
170 double TSsParserMP::GetFltFromFldV(TVec<char*>& FieldsV, const int& FldN) {
171  // parsing format {ws} [+/-] +{d} ([.]{d}) ([E|e] [+/-] +{d})
172  const char *c = FieldsV[FldN];
173  while (TCh::IsWs(*c)) { c++; }
174  if (*c=='+' || *c=='-') { c++; }
175  if (! TCh::IsNum(*c) && *c!='.') { return -1; }
176  while (TCh::IsNum(*c)) { c++; }
177  if (*c == '.') {
178  c++;
179  while (TCh::IsNum(*c)) { c++; }
180  }
181  if (*c=='e' || *c == 'E') {
182  c++;
183  if (*c == '+' || *c == '-' ) { c++; }
184  if (! TCh::IsNum(*c)) { return -1; }
185  while (TCh::IsNum(*c)) { c++; }
186  }
187  if (*c != 0) { return -1; }
188  return atof(FieldsV[FldN]);
189 }
bool Next()
Loads next line from the input file.
Definition: ssmp.cpp:17
bool GetFlt(const int &FldN, double &Val) const
If the field FldN is a float its value is returned in Val and the function returns true...
Definition: ssmp.cpp:69
TVec< uint64 > GetStartPosV(uint64 Lb, uint64 Ub) const
Finds start positions of all lines ending somewhere in [Lb, Ub)
Definition: ssmp.cpp:106
const char * DumpStr() const
Definition: ssmp.cpp:91
static bool IsNum(const char &Ch)
Definition: dt.h:1064
TSsParserMP(const TStr &FNm, const TSsFmt _SsFmt=ssfTabSep, const bool &_SkipLeadBlanks=false, const bool &_SkipCmt=true, const bool &_SkipEmptyFld=false)
Constructor.
bool Empty() const
Definition: dt.h:260
void Clr()
Definition: dt.h:258
TChA LineStr
Current line.
Definition: ssmp.h:15
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:575
uint64 LineCnt
Number of processed lines so far.
Definition: ssmp.h:13
bool GetInt(const int &FldN, int &Val) const
If the field FldN is an integer its value is returned in Val and the function returns true...
Definition: ssmp.cpp:50
bool SkipLeadBlanks
Ignore leading whitespace characters in a line.
Definition: ssmp.h:10
bool SkipEmptyFld
Skip empty fields (i.e., multiple consecutive separators are considered as one).
Definition: ssmp.h:12
bool GetNextLnBf(TChA &LnChA)
Definition: fl.cpp:763
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:570
void SkipCommentLines()
Skips lines that begin with a comment character.
Definition: ssmp.cpp:12
void SkipCommentLines()
Move stream pointer along until a non commented line is found.
Definition: fl.cpp:814
static bool IsWs(const char &Ch)
Definition: dt.h:1060
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:1022
char * CStr()
Definition: dt.h:255
TPt< TMIn > FInPt
Pointer to the input file stream.
Definition: ssmp.h:17
unsigned long long uint64
Definition: bd.h:38
TVec< char * > FldV
Pointers to fields of the current line.
Definition: ssmp.h:16
const char * GetFld(const int &FldN) const
Returns the contents of the field at index FldN.
Definition: ssmp.h:66
static int GetNum(const char &Ch)
Definition: dt.h:1066
Whitespace (space or tab) separated.
Definition: ss.h:11
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub)
Finds number of new line chars in interval [Lb, Ub)
Definition: fl.cpp:782
const TVal & Last() const
Returns a reference to the last element of the vector.
Definition: ds.h:579
static PSIn New(const void *_Bf, const uint64 &_BfL, const bool &TakeBf=false)
Definition: fl.cpp:668
Space separated.
Definition: ss.h:10
void ToLc()
Transforms the current line to lower case.
Definition: ssmp.cpp:43
uint64 GetLineEndPos(uint64 Ind)
Finds end of line in which Ind is present.
Definition: fl.cpp:802
Definition: dt.h:201
uint64 GetLineStartPos(uint64 Ind)
Finds beginning of line in which Ind is present.
Definition: fl.cpp:795
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub) const
Counts number of occurrences of '\n' in [Lb, Ub)
Definition: ssmp.cpp:102
int GetIntFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets integer at field FldN.
Definition: ssmp.cpp:152
void NextFromIndex(uint64 Index, TVec< char * > &FieldsV)
Loads next line starting from a given position.
Definition: ssmp.cpp:128
char * GetLine(uint64 Ind)
Definition: fl.cpp:810
Definition: dt.h:412
static TStr Fmt(const char *FmtStr,...)
Definition: dt.cpp:1599
bool SkipCmt
Skip comments (lines starting with #).
Definition: ssmp.h:11
char SplitCh
Separator character (if one of the non-standard separators is used)
Definition: ssmp.h:14
~TSsParserMP()
Definition: ssmp.cpp:9
TSsFmt SsFmt
Separator type.
Definition: ssmp.h:9
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:602
void DelLast()
Removes the last element of the vector.
Definition: ds.h:665
uint64 GetStreamLen() const
Returns length of stream.
Definition: ssmp.h:93
Vector is a sequence of TVal objects representing an array that can change in size.
Definition: ds.h:430
double GetFltFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets float at field FldN.
Definition: ssmp.cpp:170