SNAP Library 3.0, User Reference  2016-07-20 17:56:49
SNAP, a general purpose, high performance system for analysis and manipulation of large networks
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
table.cpp
Go to the documentation of this file.
2  if (Left != NULL) { Left->GetVariables(Variables); }
3  if (Right != NULL) { Right->GetVariables(Variables); }
4  if (Op == NOP) {
5  if (Atom.Lvar != "" ) { Variables.Add(Atom.Lvar); }
6  if (Atom.Rvar != "" ) { Variables.Add(Atom.Rvar); }
7  }
8 }
9 
10 void TPredicate::GetVariables(TStrV& Variables) {
11  Root->GetVariables(Variables);
12 }
13 
15  TPredicateNode* Curr = Root;
16  TPredicateNode* Prev = NULL;
17  while (!(Curr == NULL && Prev == Root)) {
18  // going down the tree
19  if (Prev == NULL || Prev == Curr->Parent) {
20  // left child exists and was not yet evaluated
21  if (Curr->Left != NULL) {
22  Prev = Curr;
23  Curr = Curr->Left;
24  } else if (Curr->Right != NULL) {
25  Prev = Curr;
26  Curr = Curr->Right;
27  } else {
28  Curr->Result = EvalAtomicPredicate(Curr->Atom);
29  Prev = Curr;
30  Curr = Curr->Parent;
31  }
32  } else if (Prev == Curr->Left) {
33  // going back up through left (first) child
34  switch (Curr->Op) {
35  case NOT: {
36  Assert(Curr->Right == NULL);
37  Curr->Result = !(Prev->Result);
38  Prev = Curr;
39  Curr = Curr->Parent;
40  break;
41  }
42  case AND: {
43  Assert(Curr->Right != NULL);
44  if (!Prev->Result) {
45  Curr->Result = false;
46  Prev = Curr;
47  Curr = Curr->Parent;
48  } else {
49  Prev = Curr;
50  Curr = Curr->Right;
51  }
52  break;
53  }
54  case OR: {
55  Assert(Curr->Right != NULL);
56  if (Prev->Result) {
57  Curr->Result = true;
58  Prev = Curr;
59  Curr = Curr->Parent;
60  } else {
61  Prev = Curr;
62  Curr = Curr->Right;
63  }
64  break;
65  }
66  case NOP: {
67  break;
68  }
69  }
70  } else {
71  // going back up the tree from right (second) child
72  Assert(Prev == Curr->Right);
73  switch (Curr->Op) {
74  case NOT: {
75  Assert(Curr->Left == NULL);
76  Curr->Result = !(Prev->Result);
77  break;
78  }
79  case AND: {
80  Assert(Curr->Left != NULL);
81  Assert(Curr->Left->Result);
82  Curr->Result = Prev->Result;
83  break;
84  }
85  case OR: {
86  Assert(Curr->Left != NULL);
87  Assert(!Curr->Left->Result);
88  Curr->Result = Prev->Result;
89  break;
90  }
91  case NOP: {
92  break;
93  }
94  }
95  Prev = Curr;
96  Curr = Curr->Parent;
97  }
98  }
99  return Root->Result;
100 }
101 
103  switch (Atom.Type) {
104  case atInt: {
105  if (Atom.IsConst) {
106  return EvalAtom<TInt>(IntVars.GetDat(Atom.Lvar), Atom.IntConst, Atom.Compare);
107  }
108  return EvalAtom<TInt>(IntVars.GetDat(Atom.Lvar), IntVars.GetDat(Atom.Rvar), Atom.Compare);
109  }
110  case atFlt: {
111  if (Atom.IsConst) {
112  return EvalAtom<TFlt>(FltVars.GetDat(Atom.Lvar), Atom.FltConst, Atom.Compare);
113  }
114  return EvalAtom<TFlt>(FltVars.GetDat(Atom.Lvar), FltVars.GetDat(Atom.Rvar), Atom.Compare);
115  }
116  case atStr: {
117  if (Atom.IsConst) {
118  return EvalAtom<TStr>(StrVars.GetDat(Atom.Lvar), Atom.StrConst, Atom.Compare);
119  }
120  return EvalAtom<TStr>(StrVars.GetDat(Atom.Lvar), StrVars.GetDat(Atom.Rvar), Atom.Compare);
121  }
122  }
123  return false;
124 }
125 
// Sentinel row indices stored in the Next vector: Last (-1) terminates the
// chain of rows, Invalid (-2) is written into Next[] for a removed row.
126 TInt const TTable::Last = -1;
127 TInt const TTable::Invalid = -2;
128 
// NOTE(review): presumably the switch returned by GetMP() that enables the
// parallel code paths (1 = enabled) - confirm against table.h.
129 TInt TTable::UseMP = 1;
130 
132  return this->Next();
133 }
134 
137  //Assert(CurrRowIdx != TTable::Invalid);
138  return *this;
139 }
140 
141 bool TRowIterator::operator < (const TRowIterator& RowI) const{
142  if (CurrRowIdx == TTable::Last) { return false; }
143  if (RowI.CurrRowIdx == TTable::Last) { return true; }
144  return CurrRowIdx < RowI.CurrRowIdx;
145 }
146 
147 bool TRowIterator::operator == (const TRowIterator& RowI) const {
148  return CurrRowIdx == RowI.CurrRowIdx;
149 }
150 
152  return CurrRowIdx;
153 }
154 // We do not check column type in the iterator.
156  return Table->IntCols[ColIdx][CurrRowIdx];
157 }
158 
160  return Table->FltCols[ColIdx][CurrRowIdx];
161 }
162 
164  return Table->GetStrVal(ColIdx, CurrRowIdx);
165 }
166 
167 TInt TRowIterator::GetIntAttr(const TStr& Col) const {
168  TInt ColIdx = Table->GetColIdx(Col);
169  return Table->IntCols[ColIdx][CurrRowIdx];
170 }
171 
172 TFlt TRowIterator::GetFltAttr(const TStr& Col) const {
173  TInt ColIdx = Table->GetColIdx(Col);
174  return Table->FltCols[ColIdx][CurrRowIdx];
175 }
176 
177 TStr TRowIterator::GetStrAttr(const TStr& Col) const {
178  return Table->GetStrVal(Col, CurrRowIdx);
179 }
180 
182  TInt ColIdx = Table->GetColIdx(Col);
183  return Table->StrColMaps[ColIdx][CurrRowIdx];
184 }
185 
187  return Table->StrColMaps[ColIdx][CurrRowIdx];
188 }
189 
191  TBool Result;
192  switch (Val.GetType()) {
193  case atInt:
194  Result = TPredicate::EvalAtom(GetIntAttr(ColIdx), Val.GetInt(), Cmp);
195  break;
196  case atFlt:
197  Result = TPredicate::EvalAtom(GetFltAttr(ColIdx), Val.GetFlt(), Cmp);
198  break;
199  case atStr:
200  Result = TPredicate::EvalStrAtom(GetStrAttr(ColIdx), Val.GetStr(), Cmp);
201  break;
202  default:
203  Result = TBool(false);
204  }
205  return Result;
206 }
207 
209  TBool Result;
210  //printf("string compare\n");
211  Result = TPredicate::EvalStrAtom(GetStrAttr(ColIdx), Val, Cmp);
212  return Result;
213 }
214 
216  CurrRowIdx(RowIdx), Table(TablePtr), Start(RowIdx == TablePtr->FirstValidRow) {}
217 
219  return this->Next();
220 }
221 
224  Start = false;
225  Assert(CurrRowIdx != TTable::Invalid);
226  return *this;
227 }
228 
230  if (CurrRowIdx == TTable::Last) { return false; }
231  if (RowI.CurrRowIdx == TTable::Last) { return true; }
232  return CurrRowIdx < RowI.CurrRowIdx;
233 }
234 
236  return CurrRowIdx == RowI.CurrRowIdx;
237 }
238 
240  return CurrRowIdx;
241 }
242 
244  return (Start ? Table->FirstValidRow : Table->Next[CurrRowIdx]);
245 }
246 
247 // We do not check column type in the iterator.
249  return Table->IntCols[ColIdx][GetNextRowIdx()];
250 }
251 
253  return Table->FltCols[ColIdx][GetNextRowIdx()];
254 }
255 
257  return Table->GetStrVal(ColIdx, GetNextRowIdx());
258 }
259 
261  TInt ColIdx = Table->GetColIdx(Col);
262  return Table->IntCols[ColIdx][GetNextRowIdx()];
263 }
264 
266  TInt ColIdx = Table->GetColIdx(Col);
267  return Table->FltCols[ColIdx][GetNextRowIdx()];
268 }
269 
271  return Table->GetStrVal(Col, GetNextRowIdx());
272 }
273 
275  return CurrRowIdx == Table->FirstValidRow;
276 }
277 
280 }
281 
283  TBool Result;
284  switch (Val.GetType()) {
285  case atInt:
286  Result = TPredicate::EvalAtom(GetNextIntAttr(ColIdx), Val.GetInt(), Cmp);
287  break;
288  case atFlt:
289  Result = TPredicate::EvalAtom(GetNextFltAttr(ColIdx), Val.GetFlt(), Cmp);
290  break;
291  case atStr:
292  Result = TPredicate::EvalStrAtom(GetNextStrAttr(ColIdx), Val.GetStr(), Cmp);
293  break;
294  default:
295  Result = TBool(false);
296  }
297  return Result;
298 }
299 
300 // Better not use default constructor as it leads to a memory leak.
301 // - OR - implement a destructor.
302 TTable::TTable(): Context(new TTableContext), NumRows(0), NumValidRows(0),
303  FirstValidRow(0), LastValidRow(-1) {}
304 
305 TTable::TTable(TTableContext* Context): Context(Context), NumRows(0),
306  NumValidRows(0), FirstValidRow(0), LastValidRow(-1) {}
307 
308 TTable::TTable(const Schema& TableSchema, TTableContext* Context): Context(Context),
309  NumRows(0), NumValidRows(0), FirstValidRow(0), LastValidRow(-1), IsNextDirty(0) {
310  TInt IntColCnt = 0;
311  TInt FltColCnt = 0;
312  TInt StrColCnt = 0;
313  for (TInt i = 0; i < TableSchema.Len(); i++) {
314  TStr ColName = TableSchema[i].Val1;
315  TAttrType ColType = TableSchema[i].Val2;
316  AddSchemaCol(ColName, ColType);
317  switch (ColType) {
318  case atInt:
319  AddColType(ColName, atInt, IntColCnt);
320  IntColCnt++;
321  break;
322  case atFlt:
323  AddColType(ColName, atFlt, FltColCnt);
324  FltColCnt++;
325  break;
326  case atStr:
327  AddColType(ColName, atStr, StrColCnt);
328  StrColCnt++;
329  break;
330  }
331  }
332  IntCols = TVec<TIntV>(IntColCnt);
333  FltCols = TVec<TFltV>(FltColCnt);
334  StrColMaps = TVec<TIntV>(StrColCnt);
335 }
336 
337 TTable::TTable(TSIn& SIn, TTableContext* Context): Context(Context), NumRows(SIn),
338  NumValidRows(SIn), FirstValidRow(SIn), LastValidRow(SIn), Next(SIn), IntCols(SIn),
339  FltCols(SIn), StrColMaps(SIn) {
340  THash<TStr,TPair<TInt,TInt> > ColTypeIntMap(SIn);
341 
342  ColTypeMap.Clr();
343  Sch.Clr();
344  for (THash<TStr,TPair<TInt,TInt> >::TIter it = ColTypeIntMap.BegI(); it < ColTypeIntMap.EndI(); it++) {
345  TPair<TInt,TInt> dat = it.GetDat();
346  switch (dat.GetVal1()) {
347  case 0:
348  AddColType(it.GetKey(), atInt, dat.GetVal2());
349  AddSchemaCol(it.GetKey(), atInt);
350  break;
351  case 1:
352  AddColType(it.GetKey(), atFlt, dat.GetVal2());
353  AddSchemaCol(it.GetKey(), atFlt);
354  break;
355  case 2:
356  AddColType(it.GetKey(), atStr, dat.GetVal2());
357  AddSchemaCol(it.GetKey(), atStr);
358  break;
359  }
360  }
361 
362  IsNextDirty = 0;
363 }
364 
365 TTable::TTable(const TIntIntH& H, const TStr& Col1, const TStr& Col2,
366  TTableContext* Context, const TBool IsStrKeys) : Context(Context), NumRows(H.Len()),
367  NumValidRows(H.Len()), FirstValidRow(0), LastValidRow(H.Len()-1) {
368  TAttrType KeyType = IsStrKeys ? atStr : atInt;
369  AddSchemaCol(Col1, KeyType);
370  AddSchemaCol(Col2, atInt);
371  AddColType(Col1, KeyType, 0);
372  AddColType(Col2, atInt, 1);
373  if (IsStrKeys) {
374  StrColMaps = TVec<TIntV>(1);
375  IntCols = TVec<TIntV>(1);
376  H.GetKeyV(StrColMaps[0]);
377  H.GetDatV(IntCols[0]);
378  } else {
379  IntCols = TVec<TIntV>(2);
380  H.GetKeyV(IntCols[0]);
381  H.GetDatV(IntCols[1]);
382  }
383  Next = TIntV(NumRows);
384  for (TInt i = 0; i < NumRows; i++) {
385  Next[i] = i+1;
386  }
387  Next[NumRows-1] = Last;
388  IsNextDirty = 0;
389  InitIds();
390 }
391 
392 TTable::TTable(const TIntFltH& H, const TStr& Col1, const TStr& Col2,
393  TTableContext* Context, const TBool IsStrKeys) : Context(Context),
394  NumRows(H.Len()), NumValidRows(H.Len()), FirstValidRow(0), LastValidRow(H.Len()-1) {
395  TAttrType KeyType = IsStrKeys ? atStr : atInt;
396  AddSchemaCol(Col1, KeyType);
397  AddSchemaCol(Col2, atFlt);
398  AddColType(Col1, KeyType, 0);
399  AddColType(Col2, atFlt, 0);
400  if (IsStrKeys) {
401  StrColMaps = TVec<TIntV>(1);
402  H.GetKeyV(StrColMaps[0]);
403  } else {
404  IntCols = TVec<TIntV>(1);
405  H.GetKeyV(IntCols[0]);
406  }
407  FltCols = TVec<TFltV>(1);
408  H.GetDatV(FltCols[0]);
409  Next = TIntV(NumRows);
410  for (TInt i = 0; i < NumRows; i++) {
411  Next[i] = i+1;
412  }
413  Next[NumRows-1] = Last;
414  IsNextDirty = 0;
415  InitIds();
416 }
417 
// Builds a new table from the rows of Table listed in RowIDs.
// The context, schema, graph-related column settings and column type map are
// copied from Table; the column storage is created empty and then filled by
// AddSelectedRows().
// NOTE(review): the embedded line numbering of this listing jumps from 424 to
// 426, so one statement of the original body is not visible here - presumably
// the sizing of StrColMaps; confirm against the real source file.
418 TTable::TTable(const TTable& Table, const TIntV& RowIDs) : Context(Table.Context),
419  Sch(Table.Sch), SrcCol(Table.SrcCol), DstCol(Table.DstCol), EdgeAttrV(Table.EdgeAttrV),
420  SrcNodeAttrV(Table.SrcNodeAttrV), DstNodeAttrV(Table.DstNodeAttrV),
421  CommonNodeAttrs(Table.CommonNodeAttrs) {
422  ColTypeMap = Table.ColTypeMap;
// one empty storage vector per column of the source table
423  IntCols = TVec<TIntV>(Table.IntCols.Len());
424  FltCols = TVec<TFltV>(Table.FltCols.Len());
// start from an empty table state before the selected rows are added
426  FirstValidRow = 0;
427  LastValidRow = -1;
428  NumRows = 0;
429  NumValidRows = 0;
430  AddSelectedRows(Table, RowIDs);
431  IsNextDirty = 0;
432  InitIds();
433 }
434 
435 void TTable::GetSchema(const TStr& InFNm, Schema& S, const char& Separator) {
436  // Determine Attr Type
437  // Assume that the data is tab separated
438  TSsParser Ss(InFNm, '\t', false, false, false);
439  TInt rowsToPeek = 1000;
440  TInt currRow = 0;
441  TInt lastComment = 0;
442  while (Ss.Next()) {
443  if (Ss.IsCmt()) {
444  lastComment += 1;
445  }
446  else break;
447  }
448  if (Ss.Eof()) {TExcept::Throw("No Data to determine attribute types!");}
449  TInt numCols = Ss.GetFlds();
450  TVec<TAttrType> colAttrV(numCols);
451  colAttrV.PutAll(atInt);
452  while (true) {
453  for (TInt i = 0; i < numCols; i++) {
454  if (Ss.IsInt(i)) {
455  }
456  else if (Ss.IsFlt(i)) {
457  colAttrV[i] = atFlt;
458  }
459  else {
460  colAttrV[i] = atStr;
461  }
462  }
463  currRow++;
464  if (currRow > rowsToPeek || Ss.Eof()) break;
465  Ss.Next();
466  }
467  // Default Separator is tab
468  TSsParser SsNames(InFNm, Separator, false, false, false);
469  for (int i = 0; i < lastComment; i++) { SsNames.Next();}
470  TVec<TStr> attrV;
471  TStr first(SsNames[0]);
472  int begin = 0;
473  TStr comment('#');
474  if (first != comment) {
475  for (int i = 1; i < first.Len(); i++){
476  if (first[i] != ' ') { begin = i; break;}
477  }
478  attrV.Add(first.GetSubStr(begin));
479  }
480  for (int i = 1; i < SsNames.GetFlds(); i++) {attrV.Add(SsNames[i]);}
481  for (TInt i = 0; i < numCols; i++) {
482  S.Add(TPair<TStr,TAttrType>(attrV[i],colAttrV[i]));
483  }
484 }
485 
486 #ifdef GCC_ATOMIC
487 void TTable::LoadSSPar(PTable& T, const Schema& S, const TStr& InFNm, const TIntV& RelevantCols,
488  const char& Separator, TBool HasTitleLine) {
489  // preloaded necessary variables
490  TInt RowLen = T->Sch.Len();
491  TVec<TAttrType> ColTypes = TVec<TAttrType>(RowLen);
492  for (TInt i = 0; i < RowLen; i++) {
493  ColTypes[i] = T->GetSchemaColType(i);
494  }
495 
496  TSsParserMP Ss(InFNm, Separator);
497  Ss.SkipCommentLines();
498 
499  // if title line (i.e. names of the columns) is included as first row in the
500  // input file - use it to validate schema
501  if (HasTitleLine) {
502  Ss.Next();
503  if (S.Len() != Ss.GetFlds()) {
504  printf("%s\n", Ss[0]); TExcept::Throw("Table Schema Mismatch!");
505  }
506  for (TInt i = 0; i < Ss.GetFlds(); i++) {
507  // remove carriage return char
508  TInt L = strlen(Ss[i]);
509  if (Ss[i][L-1] < ' ') { Ss[i][L-1] = 0; }
510  if (NormalizeColName(S[i].Val1) != NormalizeColName(Ss[i])) { TExcept::Throw("Table Schema Mismatch!"); }
511  }
512  }
513 
514  // Divide remaining part of stream into equal sized chunks
515  // Find starting position in stream for each thread
516  int64 Cnt = 0;
517  uint64 Pos = Ss.GetStreamPos();
518  uint64 Len = Ss.GetStreamLen();
519  uint64 Rem = Len - Pos;
520  int NumThreads = omp_get_max_threads();
521 
522  uint64 Delta = Rem / NumThreads;
523  if (Delta < 1) Delta = 1;
524 
525  TVec<uint64> StartIntV(NumThreads);
526  TVec<uint64> LineCountV(NumThreads);
527  TVec<uint64> PrefixSumV(NumThreads);
528 
529  StartIntV[0] = Pos;
530  for (int i = 1; i < NumThreads; i++) {
531  StartIntV[i] = StartIntV[i-1] + Delta;
532  }
533  StartIntV.Add(Len);
534 
535  // Find number of lines handled by each thread
536  omp_set_num_threads(NumThreads);
537  #pragma omp parallel for schedule(dynamic) reduction(+:Cnt)
538  for (int i = 0; i < NumThreads; i++) {
539  LineCountV[i] = Ss.CountNewLinesInRange(StartIntV[i], StartIntV[i+1]);
540  Cnt += LineCountV[i];
541  }
542 
543  // Calculate row index offsets for each thread
544  PrefixSumV[0] = 0;
545  for (int i = 1; i < NumThreads; i++) {
546  PrefixSumV[i] = PrefixSumV[i-1] + LineCountV[i-1];
547  }
548  Ss.SetStreamPos(Pos);
549 
550  // allocate memory for columns
551  TInt IntColIdx = 0;
552  TInt FltColIdx = 0;
553  for (TInt i = 0; i < RowLen; i++) {
554  switch (ColTypes[i]) {
555  case atInt:
556  T->IntCols[IntColIdx].Gen(Cnt);
557  IntColIdx++;
558  break;
559  case atFlt:
560  T->FltCols[FltColIdx].Gen(Cnt);
561  FltColIdx++;
562  break;
563  case atStr:
564  break;
565  }
566  }
567 
568  Cnt = 0;
569  omp_set_num_threads(NumThreads);
570  #pragma omp parallel for schedule(dynamic) reduction(+:Cnt)
571  for (int i = 0; i < NumThreads; i++) {
572  // calculate beginning of each line handled by thread
573  TVec<uint64> LineStartPosV = Ss.GetStartPosV(StartIntV[i], StartIntV[i+1]);
574 
575  // parse line and fill rows
576  for (uint64 k = 0; k < (uint64) LineStartPosV.Len(); k++) {
577  TVec<char*> FieldsV;
578  Ss.NextFromIndex(LineStartPosV[k], FieldsV);
579  if (FieldsV.Len() != S.Len()) {
580  TExcept::Throw("Error reading tsv file");
581  }
582  TInt IntColIdx = 0;
583  TInt FltColIdx = 0;
584  TInt RowIdx = PrefixSumV[i] + k;
585 
586  for (TInt j = 0; j < RowLen; j++) {
587  switch (ColTypes[j]) {
588  case atInt:
589  if (RelevantCols.Len() == 0) {
590  T->IntCols[IntColIdx][RowIdx] = \
591  (Ss.GetIntFromFldV(FieldsV, j));
592  } else {
593  T->IntCols[IntColIdx][RowIdx] = \
594  (Ss.GetIntFromFldV(FieldsV, RelevantCols[j]));
595  }
596  IntColIdx++;
597  break;
598  case atFlt:
599  if (RelevantCols.Len() == 0) {
600  T->FltCols[FltColIdx][RowIdx] = \
601  (Ss.GetFltFromFldV(FieldsV, j));
602  } else {
603  T->FltCols[FltColIdx][RowIdx] = \
604  (Ss.GetFltFromFldV(FieldsV, RelevantCols[j]));
605  }
606  FltColIdx++;
607  break;
608  case atStr:
609  TExcept::Throw("TTable::LoadSS:: Str Col found\n");
610  break;
611  }
612  }
613  Cnt++;
614  }
615  }
616 
617  // set number of rows and "Next" vector
618  T->NumRows = Cnt;
619  T->NumValidRows = T->NumRows;
620 
621  T->Next.Clr();
622  T->Next.Gen(Cnt);
623 
624  omp_set_num_threads(NumThreads);
625  #pragma omp parallel for schedule(dynamic, 10000)
626  for (int64 i = 0; i < Cnt-1; i++) {
627  T->Next[i] = i+1;
628  }
629  T->IsNextDirty = 0;
630  T->Next[Cnt-1] = Last;
631  T->LastValidRow = T->NumRows - 1;
632 
633  T->IdColName = "_id";
634  TInt IdCol = T->IntCols.Add();
635  T->IntCols[IdCol].Gen(Cnt);
636 
637  // initialize ID column
638  omp_set_num_threads(NumThreads);
639  #pragma omp parallel for schedule(dynamic, 10000)
640  for (int64 i = 0; i < Cnt; i++) {
641  T->IntCols[IdCol][i] = i;
642  }
643 
644  T->AddSchemaCol(T->IdColName, atInt);
645  T->AddColType(T->IdColName, atInt, T->IntCols.Len()-1);
646 }
647 #endif // GCC_ATOMIC
648 
650  PTable& T, const Schema& S, const TStr& InFNm, const TIntV& RelevantCols,
651  const char& Separator, TBool HasTitleLine) {
652  // preloaded necessary variables
653  int RowLen = T->Sch.Len();
654  TVec<TAttrType> ColTypes = TVec<TAttrType>(RowLen);
655  for (int i = 0; i < RowLen; i++) {
656  ColTypes[i] = T->GetSchemaColType(i);
657  }
658 
659  // Sequential load
660  TSsParser Ss(InFNm, Separator);
661  // if title line (i.e. names of the columns) is included as first row in the
662  // input file - use it to validate schema
663  if (HasTitleLine) {
664  Ss.Next();
665  if (S.Len() != Ss.GetFlds()) {
666  printf("%s\n", Ss[0]); TExcept::Throw("Table Schema Mismatch!");
667  }
668  for (int i = 0; i < Ss.GetFlds(); i++) {
669  // remove carriage return char
670  int L = strlen(Ss[i]);
671  if (Ss[i][L-1] < ' ') { Ss[i][L-1] = 0; }
672  if (NormalizeColName(S[i].Val1) != NormalizeColName(Ss[i])) { TExcept::Throw("Table Schema Mismatch!"); }
673  }
674  }
675 
676  // populate table columns
677  //printf("starting to populate table\n");
678  uint64 Cnt = 0;
679  while (Ss.Next()) {
680  int IntColIdx = 0;
681  int FltColIdx = 0;
682  int StrColIdx = 0;
683  Assert(Ss.GetFlds() == S.Len()); // compiled only in debug
684  if (Ss.GetFlds() != S.Len()) {
685  printf("%s\n", Ss[S.Len()]); TExcept::Throw("Error reading tsv file");
686  }
687  for (int i = 0; i < RowLen; i++) {
688  switch (ColTypes[i]) {
689  case atInt:
690  if (RelevantCols.Len() == 0) {
691  T->IntCols[IntColIdx].Add(Ss.GetInt(i));
692  } else {
693  T->IntCols[IntColIdx].Add(Ss.GetInt(RelevantCols[i]));
694  }
695  IntColIdx++;
696  break;
697  case atFlt:
698  if (RelevantCols.Len() == 0) {
699  T->FltCols[FltColIdx].Add(Ss.GetFlt(i));
700  } else {
701  T->FltCols[FltColIdx].Add(Ss.GetFlt(RelevantCols[i]));
702  }
703  FltColIdx++;
704  break;
705  case atStr:
706  int ColIdx;
707  if (RelevantCols.Len() == 0) {
708  ColIdx = i;
709  } else {
710  ColIdx = RelevantCols[i];
711  }
712  TStr Sval = TStr(Ss[ColIdx]);
713  T->AddStrVal(StrColIdx, Sval);
714  StrColIdx++;
715  break;
716  }
717  }
718  Cnt += 1;
719  }
720  //printf("finished populating table\n");
721  // set number of rows and "Next" vector
722  T->NumRows = static_cast<int>(Cnt);
723  T->NumValidRows = T->NumRows;
724 
725  T->Next.Clr();
726  T->Next.Gen(static_cast<int>(Cnt));
727  for (uint64 i = 0; i < Cnt-1; i++) {
728  T->Next[static_cast<int>(i)] = static_cast<int>(i+1);
729  }
730  T->IsNextDirty = 0;
731  T->Next[static_cast<int>(Cnt-1)] = Last;
732  T->LastValidRow = T->NumRows - 1;
733 
734  T->InitIds();
735 }
736 
737 PTable TTable::LoadSS(const Schema& S, const TStr& InFNm, TTableContext* Context,
738  const TIntV& RelevantCols, const char& Separator, TBool HasTitleLine) {
739  TVec<uint64> IntGroupByCols;
740  bool NoStringCols = true;
741 
742  // find the schema for the new table which contains only relevant columns
743  Schema SR;
744  if (RelevantCols.Len() == 0) {
745  SR = S;
746  } else {
747  for (int i = 0; i < RelevantCols.Len(); i++) {
748  SR.Add(S[RelevantCols[i]]);
749  }
750  }
751  PTable T = New(SR, Context);
752 
753  // find col types and check for string cols
754  for (int i = 0; i < SR.Len(); i++) {
755  if (T->GetSchemaColType(i) == atStr) {
756  NoStringCols = false;
757  break;
758  }
759  }
760 
761  if (GetMP() && NoStringCols) {
762  // Right now, can load in parallel only in Linux (for mmap) and if
763  // there are no string columns
764 #ifdef GLib_LINUX
765  LoadSSPar(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
766 #else
767  LoadSSSeq(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
768 #endif
769  } else {
770  LoadSSSeq(T, S, InFNm, RelevantCols, Separator, HasTitleLine);
771  }
772  return T;
773 }
774 
775 PTable TTable::LoadSS(const Schema& S, const TStr& InFNm, TTableContext* Context,
776  const char& Separator, TBool HasTitleLine) {
777  return LoadSS(S, InFNm, Context, TIntV(), Separator, HasTitleLine);
778 }
779 
780 void TTable::SaveSS(const TStr& OutFNm) {
781  if (NumValidRows == 0) {
782  printf("Table is empty");
783  return;
784  }
785  FILE* F = fopen(OutFNm.CStr(), "w");
786  // debug
787  if (F == NULL) {
788  printf("failed to open file %s\n", OutFNm.CStr());
789  perror("fail ");
790  return;
791  }
792 
793  Dump(F);
794 
795 #if 0
796  Schema DSch = DenormalizeSchema();
797 
798  TInt L = Sch.Len();
799  // print title (schema)
800  fprintf(F, "# ");
801  for (TInt i = 0; i < L-1; i++) {
802  fprintf(F, "%s\t", DSch[i].Val1.CStr());
803  }
804  fprintf(F, "%s\n", DSch[L-1].Val1.CStr());
805  // print table contents
806  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
807  for (TInt i = 0; i < L; i++) {
808  char C = (i == L-1) ? '\n' : '\t';
809  switch (GetSchemaColType(i)) {
810  case atInt: {
811  fprintf(F, "%d%c", RowI.GetIntAttr(GetSchemaColName(i)).Val, C);
812  break;
813  }
814  case atFlt: {
815  fprintf(F, "%f%c", RowI.GetFltAttr(GetSchemaColName(i)).Val, C);
816  break;
817  }
818  case atStr: {
819  fprintf(F, "%s%c", RowI.GetStrAttr(GetSchemaColName(i)).CStr(), C);
820  break;
821  }
822  }
823  }
824  }
825 #endif
826  fclose(F);
827 }
828 
829 void TTable::SaveBin(const TStr& OutFNm) {
830  TFOut SOut(OutFNm);
831  Save(SOut);
832 }
833 
834 void TTable::Save(TSOut& SOut) {
835  NumRows.Save(SOut);
836  NumValidRows.Save(SOut);
837  FirstValidRow.Save(SOut);
838  LastValidRow.Save(SOut);
839  Next.Save(SOut);
840  IntCols.Save(SOut);
841  FltCols.Save(SOut);
842  StrColMaps.Save(SOut);
843 
844  THash<TStr,TPair<TInt,TInt> > ColTypeIntMap;
845  TInt atIntVal = TInt(0);
846  TInt atFltVal = TInt(1);
847  TInt atStrVal = TInt(2);
848  for (THash<TStr,TPair<TAttrType,TInt> >::TIter it = ColTypeMap.BegI(); it < ColTypeMap.EndI(); it++) {
849  TPair<TAttrType,TInt> dat = it.GetDat();
850  TStr DColName = DenormalizeColName(it.GetKey());
851  switch (dat.GetVal1()) {
852  case atInt:
853  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atIntVal, dat.GetVal2()));
854  break;
855  case atFlt:
856  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atFltVal, dat.GetVal2()));
857  break;
858  case atStr:
859  ColTypeIntMap.AddDat(DColName, TPair<TInt,TInt>(atStrVal, dat.GetVal2()));
860  break;
861  }
862  }
863  ColTypeIntMap.Save(SOut);
864  SOut.Flush();
865 }
866 
867 void TTable::Dump(FILE *OutF) const {
868  TInt L = Sch.Len();
869  Schema DSch = DenormalizeSchema();
870 
871  // LoadSS() will not throw away lines with #
872  //fprintf(OutF, "# Table: rows: %d, columns: %d\n", GetNumValidRows(), GetNodes());
873  // print title (schema), LoadSS() will take first line as (optional) schema
874  fprintf(OutF, "# ");
875  for (TInt i = 0; i < L-1; i++) {
876  fprintf(OutF, "%s\t", DSch[i].Val1.CStr());
877  }
878  fprintf(OutF, "%s\n", DSch[L-1].Val1.CStr());
879  // print table contents
880  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
881  for (TInt i = 0; i < L; i++) {
882  char C = (i == L-1) ? '\n' : '\t';
883  switch (GetSchemaColType(i)) {
884  case atInt: {
885  fprintf(OutF, "%d%c", RowI.GetIntAttr(GetSchemaColName(i)).Val, C);
886  break;
887  }
888  case atFlt: {
889  fprintf(OutF, "%f%c", RowI.GetFltAttr(GetSchemaColName(i)).Val, C);
890  break;
891  }
892  case atStr: {
893  fprintf(OutF, "%s%c", RowI.GetStrAttr(GetSchemaColName(i)).CStr(), C);
894  break;
895  }
896  }
897  }
898  }
899 }
900 
902  TInt L = Sch.Len();
903 
904 #if 0
905  // print table on the input, iterate over all columns
906  for (TInt i = 0; i < L; i++) {
907  // skip non-string columns
908  if (GetSchemaColType(i) != atStr) {
909  continue;
910  }
911 
912  TInt ColIdx = GetColIdx(GetSchemaColName(i));
913 
914  // iterate over all rows
915  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
916  TInt RowIdx = RowI.GetRowIdx();
917  TInt KeyId = StrColMaps[ColIdx][RowIdx];
918  printf("ChangeContext in %d %d %d .%s.\n",
919  ColIdx.Val, RowIdx.Val, KeyId.Val, GetStrVal(ColIdx, RowIdx).CStr());
920  }
921  }
922 #endif
923 
924  // add strings to the new context, change values
925  // iterate over all columns
926  for (TInt i = 0; i < L; i++) {
927  // skip non-string columns
928  if (GetSchemaColType(i) != atStr) {
929  continue;
930  }
931 
932  TInt ColIdx = GetColIdx(GetSchemaColName(i));
933 
934  // iterate over all rows
935  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
936  TInt RowIdx = RowI.GetRowIdx();
937  // get the string
938  TStr Key = GetStrVal(ColIdx, RowIdx);
939  // add the string to the new context
940  TInt KeyId = TInt(NewContext->StringVals.AddKey(Key));
941  // change the value in the table
942  StrColMaps[ColIdx][RowIdx] = KeyId;
943  }
944  }
945 
946  // set the new context
947  Context = NewContext;
948  return Context;
949 }
950 
951 void TTable::AddStrVal(const TInt& ColIdx, const TStr& Key) {
952  TInt KeyId = TInt(Context->StringVals.AddKey(Key));
953  //printf("TTable::AddStrVal2 %d .%s. %d\n", ColIdx.Val, Key.CStr(), KeyId.Val);
954  StrColMaps[ColIdx].Add(KeyId);
955 }
956 
957 void TTable::AddStrVal(const TStr& Col, const TStr& Key) {
958  if (GetColType(Col) != atStr) {
959  TExcept::Throw(Col + " is not a string valued column");
960  }
961  //printf("TTable::AddStrVal1 .%s. .%s.\n", Col.CStr(), Key.CStr());
962  AddStrVal(GetColIdx(Col), Key);
963 }
964 
965 void TTable::AddGraphAttribute(const TStr& Attr, TBool IsEdge, TBool IsSrc, TBool IsDst) {
966  if (!IsColName(Attr)) { TExcept::Throw(Attr + ": No such column"); }
967  if (IsEdge) { EdgeAttrV.Add(NormalizeColName(Attr)); }
968  if (IsSrc) { SrcNodeAttrV.Add(NormalizeColName(Attr)); }
969  if (IsDst) { DstNodeAttrV.Add(NormalizeColName(Attr)); }
970 }
971 
972 void TTable::AddGraphAttributeV(TStrV& Attrs, TBool IsEdge, TBool IsSrc, TBool IsDst) {
973  for (TInt i = 0; i < Attrs.Len(); i++) {
974  if (!IsColName(Attrs[i])) {
975  TExcept::Throw(Attrs[i] + ": no such column");
976  }
977  }
978  for (TInt i = 0; i < Attrs.Len(); i++) {
979  if (IsEdge) { EdgeAttrV.Add(NormalizeColName(Attrs[i])); }
980  if (IsSrc) { SrcNodeAttrV.Add(NormalizeColName(Attrs[i])); }
981  if (IsDst) { DstNodeAttrV.Add(NormalizeColName(Attrs[i])); }
982  }
983 }
984 
986  TStrV IntNA = TStrV(IntCols.Len(),0);
987  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
988  TStr Attr = SrcNodeAttrV[i];
989  if (GetColType(Attr) == atInt) {
990  IntNA.Add(Attr);
991  }
992  }
993  return IntNA;
994 }
995 
997  TStrV IntNA = TStrV(IntCols.Len(),0);
998  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
999  TStr Attr = DstNodeAttrV[i];
1000  if (GetColType(Attr) == atInt) {
1001  IntNA.Add(Attr);
1002  }
1003  }
1004  return IntNA;
1005 }
1006 
1008  TStrV IntEA = TStrV(IntCols.Len(),0);
1009  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1010  TStr Attr = EdgeAttrV[i];
1011  if (GetColType(Attr) == atInt) {
1012  IntEA.Add(Attr);
1013  }
1014  }
1015  return IntEA;
1016 }
1017 
1019  TStrV FltNA = TStrV(FltCols.Len(),0);
1020  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
1021  TStr Attr = SrcNodeAttrV[i];
1022  if (GetColType(Attr) == atFlt) {
1023  FltNA.Add(Attr);
1024  }
1025  }
1026  return FltNA;
1027 }
1028 
1030  TStrV FltNA = TStrV(FltCols.Len(),0);
1031  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
1032  TStr Attr = DstNodeAttrV[i];
1033  if (GetColType(Attr) == atFlt) {
1034  FltNA.Add(Attr);
1035  }
1036  }
1037  return FltNA;
1038 }
1039 
1041  TStrV FltEA = TStrV(FltCols.Len(),0);;
1042  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1043  TStr Attr = EdgeAttrV[i];
1044  if (GetColType(Attr) == atFlt) {
1045  FltEA.Add(Attr);
1046  }
1047  }
1048  return FltEA;
1049 }
1050 
1052  TStrV StrNA = TStrV(StrColMaps.Len(),0);
1053  for (TInt i = 0; i < SrcNodeAttrV.Len(); i++) {
1054  TStr Attr = SrcNodeAttrV[i];
1055  if (GetColType(Attr) == atStr) {
1056  StrNA.Add(Attr);
1057  }
1058  }
1059  return StrNA;
1060 }
1061 
1063  TStrV StrNA = TStrV(StrColMaps.Len(),0);
1064  for (TInt i = 0; i < DstNodeAttrV.Len(); i++) {
1065  TStr Attr = DstNodeAttrV[i];
1066  if (GetColType(Attr) == atStr) {
1067  StrNA.Add(Attr);
1068  }
1069  }
1070  return StrNA;
1071 }
1072 
1073 
1075  TStrV StrEA = TStrV(StrColMaps.Len(),0);
1076  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
1077  TStr Attr = EdgeAttrV[i];
1078  if (GetColType(Attr) == atStr) {
1079  StrEA.Add(Attr);
1080  }
1081  }
1082  return StrEA;
1083 }
1084 
1085 void TTable::Rename(const TStr& column, const TStr& NewLabel) {
1086  // This function is necessary, for example to take the union of two tables
1087  // where the attribute names don't match.
1088  if (!IsColName(column)) { TExcept::Throw("no such column " + column); }
1089  TPair<TAttrType,TInt> ColVal = GetColTypeMap(column);
1090  DelColType(column);
1091  AddColType(NewLabel, ColVal);
1092  TStr NColName = NormalizeColName(column);
1093  TStr NLabel = NormalizeColName(NewLabel);
1094  for (TInt c = 0; c < Sch.Len(); c++) {
1095  if (Sch[c].Val1 == NColName) {
1096  Sch.SetVal(c, TPair<TStr, TAttrType>(NLabel, Sch[c].Val2));
1097  break;
1098  }
1099  }
1100 }
1101 
1103  if (FirstValidRow == LastValidRow) {
1104  LastValidRow = -1;
1105  }
1106 
1107  TInt Old = FirstValidRow;
1109  Next[Old] = TTable::Invalid;
1110  NumValidRows--;
1111  TInt IdColIdx = GetColIdx(GetIdColName());
1112  RowIdMap.AddDat(IntCols[IdColIdx][Old], Invalid);
1113 }
1114 
1115 void TTable::RemoveRow(TInt RowIdx, TInt PrevRowIdx) {
1116  if (RowIdx == FirstValidRow) {
1117  RemoveFirstRow();
1118  return;
1119  }
1120  Assert(RowIdx != TTable::Invalid);
1121  if (RowIdx == TTable::Last) { return; }
1122  Next[PrevRowIdx] = Next[RowIdx];
1123  if (LastValidRow == RowIdx) {
1124  LastValidRow = RowIdx;
1125  }
1126  Next[RowIdx] = TTable::Invalid;
1127  NumValidRows--;
1128  TInt IdColIdx = GetColIdx(GetIdColName());
1129  RowIdMap.AddDat(IntCols[IdColIdx][RowIdx], Invalid);
1130 }
1131 
// Removes every row whose index is not listed in KeepV.
// The indices in KeepV are put into a hash set; the rows are then walked in
// chain order and each row not found in the set is removed. Once KeepV.Len()
// rows have been kept, all remaining rows are removed without a lookup.
// Finally LastValidRow is set to the last element of KeepV.
// NOTE(review): the statement declaring RowI is not visible in this listing
// (the embedded numbering skips 1138) - presumably a removing row iterator
// positioned before the first row; confirm against the real source file.
// NOTE(review): an empty KeepV makes the final statement index KeepV[-1].
1132 void TTable::KeepSortedRows(const TIntV& KeepV) {
1133  TIntIntH KeepH(KeepV.Len());
1134  for (TInt i = 0; i < KeepV.Len(); i++) {
1135  KeepH.AddKey(KeepV[i]);
1136  }
1137 
1139  TInt KeepSize = 0;
1140  while (RowI.GetNextRowIdx() != Last) {
1141  if (KeepSize < KeepV.Len()) {
1142  if (KeepH.IsKey(RowI.GetNextRowIdx())) {
1143  KeepSize++;
1144  RowI++;
1145  } else {
1146  RowI.RemoveNext();
1147  }
1148  } else {
1149  // Covered all of KeepV. Remove the rest of the rows.
1150  // Current RowI.CurrRowIdx is the last element of KeepV.
1151  RowI.RemoveNext();
1152  }
1153  }
1154  LastValidRow = KeepV[KeepV.Len()-1];
1155 }
1156 
1157 void TTable::GetPartitionRanges(TIntPrV& Partitions, TInt NumPartitions) const {
1158  TInt PartitionSize = NumValidRows / (NumPartitions);
1159  if (NumValidRows % NumPartitions != 0) PartitionSize++;
1160  if (PartitionSize < 10) {
1161  PartitionSize = 10;
1162  NumPartitions = NumValidRows / PartitionSize;
1163  }
1164  Partitions.Reserve(NumPartitions+1);
1165 
1166  TInt currRow = FirstValidRow;
1167  TInt currStart = currRow;
1168  if (IsNextDirty) {
1169  TInt currCount = PartitionSize;
1170  while (currRow != TTable::Last) {
1171  if (currCount == 0) {
1172  Partitions.Add(TIntPr(currStart, currRow));
1173  currStart = currRow;
1174  currCount = PartitionSize;
1175  }
1176  currRow = Next[currRow];
1177  currCount--;
1178  }
1179  Partitions.Add(TIntPr(currStart, currRow));
1180  } else {
1181  // Optimize for the case when rows are logically in sequence.
1182  currRow += PartitionSize;
1183  while (currRow != TTable::Last && currRow < Next.Len()) {
1184  if (Next[currRow] == TTable::Invalid) { currRow++; continue; }
1185  Partitions.Add(TIntPr(currStart, currRow));
1186  currStart = currRow;
1187  currRow += PartitionSize;
1188  }
1189  Partitions.Add(TIntPr(currStart, TTable::Last));
1190  }
1191  //printf("Num partitions: %d\n", Partitions.Len());
1192 }
1193 
1194 /***** Grouping Utility functions ****/
1195 void TTable::GroupingSanityCheck(const TStr& GroupBy, const TAttrType& AttrType) const {
1196  if (!IsColName(GroupBy)) {
1197  TExcept::Throw("no such column " + GroupBy);
1198  }
1199  if (GetColType(GroupBy) != AttrType) {
1200  TExcept::Throw(GroupBy + " values are not of expected type");
1201  }
1202 }
1203 
1204 #ifdef GCC_ATOMIC
// Groups the rows by the integer column GroupBy into the concurrent hash Grouping.
// The row ranges from GetPartitionRanges are processed by an OpenMP parallel loop.
//  GroupBy         name of the column to group on; must be of type atInt
//  Grouping        output: group-by value -> vector of row identifiers
//  UsePhysicalIds  when true the physical row index is stored, otherwise the
//                  value of the id column
// Throws when no id column exists and UsePhysicalIds is false.
// Prints the elapsed wall-clock time to stdout.
1205 void TTable::GroupByIntColMP(const TStr& GroupBy, THashMP<TInt, TIntV>& Grouping, TBool UsePhysicalIds) const {
1206  timeval timer0;
1207  gettimeofday(&timer0, NULL);
1208  double t1 = timer0.tv_sec + (timer0.tv_usec/1000000.0);
1209  //printf("X\n");
1210  TInt IdColIdx = GetColIdx(IdColName);
1211  TInt GroupByColIdx = GetColIdx(GroupBy);
1212  if(!UsePhysicalIds && IdColIdx < 0){
1213  TExcept::Throw("Grouping: Either use physical row ids, or have an id column");
1214  }
1215  //double startFn = omp_get_wtime();
1216  GroupingSanityCheck(GroupBy, atInt);
1217  TIntPrV Partitions;
1218  GetPartitionRanges(Partitions, 8*CHUNKS_PER_THREAD);
 // NOTE(review): PartitionSize is computed but not read again in this function.
1219  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
1220  //double endPart = omp_get_wtime();
1221  //printf("Partition time = %f\n", endPart-startFn);
1222 
1223  Grouping.Gen(NumValidRows);
1224  //double endGen = omp_get_wtime();
1225  //printf("Gen time = %f\n", endGen-endPart);
1226  //printf("S\n");
1227  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) //num_threads(1)
1228  for (int i = 0; i < Partitions.Len(); i++){
 // Each iteration walks one range [Val1, Val2) of rows.
1229  TRowIterator RowI(Partitions[i].GetVal1(), this);
1230  TRowIterator EndI(Partitions[i].GetVal2(), this);
1231  while (RowI < EndI) {
1232  TInt idx = UsePhysicalIds ? RowI.GetRowIdx() : RowI.GetIntAttr(IdColIdx);
1233  // printf("updating grouping with key = %d, row_id = %d\n", RowI.GetIntAttr(GroupBy).Val, idx.Val);
1234  UpdateGrouping<TInt>(Grouping, RowI.GetIntAttr(GroupByColIdx), idx);
1235  RowI++;
1236  }
1237  }
1238  gettimeofday(&timer0, NULL);
1239  double t2 = timer0.tv_sec + (timer0.tv_usec/1000000.0);
1240  printf("Grouping time: %f\n", t2 - t1);
1241  //double endAdd = omp_get_wtime();
1242  //printf("Add time = %f\n", endAdd-endGen);
1243 }
1244 #endif // GCC_ATOMIC
1245 
1246 void TTable::Unique(const TStr& Col) {
1247  TIntV RemainingRows;
1248  TStr NCol = NormalizeColName(Col);
1249  switch (GetColType(NCol)) {
1250  case atInt: {
1251  TIntIntVH Grouping;
1252  GroupByIntCol(NCol, Grouping, TIntV(), true, true);
1253  for (TIntIntVH::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1254  RemainingRows.Add(it->Dat[0]);
1255  }
1256  break;
1257  }
1258  case atFlt: {
1259  THash<TFlt,TIntV> Grouping;
1260  GroupByFltCol(NCol, Grouping, TIntV(), true, true);
1261  for (THash<TFlt,TIntV>::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1262  RemainingRows.Add(it->Dat[0]);
1263  }
1264  break;
1265  }
1266  case atStr: {
1267  TIntIntVH Grouping;
1268  GroupByStrCol(NCol, Grouping, TIntV(), true, true);
1269  for (TIntIntVH::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1270  RemainingRows.Add(it->Dat[0]);
1271  }
1272  break;
1273  }
1274  }
1275  KeepSortedRows(RemainingRows);
1276 }
1277 
1278 void TTable::Unique(const TStrV& Cols, TBool Ordered) {
1279  if(Cols.Len() == 1){
1280  Unique(Cols[0]);
1281  return;
1282  }
1283  TStrV NCols = NormalizeColNameV(Cols);
1285  TIntV UniqueVec;
1286  GroupAux(NCols, Grouping, Ordered, "", true, UniqueVec, true);
1287  KeepSortedRows(UniqueVec);
1288 }
1289 
// Writes group ids into the last integer column and registers that column
// under GroupColName.
//  GroupColName    name under which the column type is registered
//  GroupAndRowIds  (group id, row index) pairs
// NOTE(review): one statement (listing line 1292) is absent from this extract;
// presumably it appends the new column to IntCols -- confirm against the
// original file.
1290 void TTable::StoreGroupCol(const TStr& GroupColName, const TVec<TPair<TInt, TInt> >& GroupAndRowIds) {
1291  // Add a column where the value of the i'th row is the group id of row i.
1293  TInt L = IntCols.Len();
1294  AddColType(GroupColName, atInt, L-1);
1295  // Store group id for each row.
1296  for (TInt i = 0; i < GroupAndRowIds.Len(); i++) {
1297  IntCols[L-1][GroupAndRowIds[i].Val2] = GroupAndRowIds[i].Val1;
1298  }
1299 }
1300 
1301 // Core grouping logic.
// Builds, for every distinct combination of values in the GroupBy columns, a
// group number and the list of rows belonging to it.
//  GroupBy         columns to group on; an unknown column raises an exception
//  Grouping        output: group key -> (group number, row identifiers)
//  Ordered         when false, the key values of each type are sorted before
//                  the key is formed
//  GroupColName    when non-empty, a column of group numbers is added under
//                  this name
//  KeepUnique      when true, only the first row of each group is recorded (in
//                  UniqueVec) and the group mappings are not updated
//  UniqueVec       output: identifier of the first row of each group (KeepUnique only)
//  UsePhysicalIds  store physical row indices instead of id-column values
1302 void TTable::GroupAux(const TStrV& GroupBy, THash<TGroupKey, TPair<TInt, TIntV> >& Grouping,
1303  TBool Ordered, const TStr& GroupColName, TBool KeepUnique, TIntV& UniqueVec, TBool UsePhysicalIds) {
1304  TInt IdColIdx = GetColIdx(IdColName);
1305  if(!UsePhysicalIds && IdColIdx < 0){
1306  TExcept::Throw("Grouping: Either use physical row ids, or have an id column");
1307  }
1308  TIntV IntGroupByCols;
1309  TIntV FltGroupByCols;
1310  TIntV StrGroupByCols;
1311  // get indices for each column type
1312  for (TInt c = 0; c < GroupBy.Len(); c++) {
1313  //printf("GroupBy col %d: %s\n", c.Val, GroupBy[c].CStr());
1314  if (!IsColName(GroupBy[c])) {
1315  TExcept::Throw("no such column " + GroupBy[c]);
1316  }
1317 
1318  TPair<TAttrType, TInt> ColType = GetColTypeMap(GroupBy[c]);
1319  switch (ColType.Val1) {
1320  case atInt:
1321  IntGroupByCols.Add(ColType.Val2);
1322  break;
1323  case atFlt:
1324  FltGroupByCols.Add(ColType.Val2);
1325  break;
1326  case atStr:
1327  StrGroupByCols.Add(ColType.Val2);
1328  break;
1329  }
1330  }
1331 
1332  TInt IKLen = IntGroupByCols.Len();
1333  TInt FKLen = FltGroupByCols.Len();
1334  TInt SKLen = StrGroupByCols.Len();
1335 
1336  TInt GroupNum = 0;
1337  TVec<TPair<TInt, TInt> > GroupAndRowIds;
1338  //printf("done GroupAux initialization\n");
1339 
1340  // iterate over rows
1341  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
1342  TIntV IKey(IKLen + SKLen, 0);
1343  TFltV FKey(FKLen, 0);
1344  TIntV SKey(SKLen, 0);
1345 
1346  // find group key
1347  for (TInt c = 0; c < IKLen; c++) {
1348  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
1349  }
1350  for (TInt c = 0; c < FKLen; c++) {
1351  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
1352  }
1353  for (TInt c = 0; c < SKLen; c++) {
1354  SKey.Add(it.GetStrMapById(StrGroupByCols[c]));
1355  }
1356  if (!Ordered) {
1357  if (IKLen > 0) { IKey.ISort(0, IKey.Len()-1, true); }
1358  if (FKLen > 0) { FKey.ISort(0, FKey.Len()-1, true); }
1359  if (SKLen > 0) { SKey.ISort(0, SKey.Len()-1, true); }
1360  }
 // String key values are appended after the integer values in IKey.
1361  for (TInt c = 0; c < SKLen; c++) {
1362  IKey.Add(SKey[c]);
1363  }
1364 
1365  // look for group matching the key
1366  TGroupKey GroupKey = TGroupKey(IKey, FKey);
1367 
1368  TInt RowIdx = it.GetRowIdx();
1369  TInt idx = UsePhysicalIds ? it.GetRowIdx() : IntCols[IdColIdx][it.GetRowIdx()];
1370  if (!Grouping.IsKey(GroupKey)) {
1371  // Grouping key hasn't been seen before, create a new group
1372  TPair<TInt, TIntV> NewGroup;
1373  NewGroup.Val1 = GroupNum;
1374  NewGroup.Val2.Add(idx);
1375  Grouping.AddDat(GroupKey, NewGroup);
1376  if (GroupColName != "") {
1377  GroupAndRowIds.Add(TPair<TInt, TInt>(GroupNum, RowIdx));
1378  }
1379  if (KeepUnique) {
1380  UniqueVec.Add(idx);
1381  }
1382  GroupNum++;
1383  } else {
1384  // Grouping key has been seen before, update corresponding group
1385  if (!KeepUnique) {
1386  TPair<TInt, TIntV>& NewGroup = Grouping.GetDat(GroupKey);
1387  NewGroup.Val2.Add(idx);
1388  if (GroupColName != "") {
1389  GroupAndRowIds.Add(TPair<TInt, TInt>(NewGroup.Val1, RowIdx));
1390  }
1391  }
1392  }
1393  }
1394  // printf("KeepUnique: %d\n", KeepUnique.Val);
1395  // update group mapping
1396  if (!KeepUnique) {
1397  GroupStmt Stmt(NormalizeColNameV(GroupBy), Ordered, UsePhysicalIds);
1398  GroupStmtNames.AddDat(GroupColName, Stmt);
1399  GroupIDMapping.AddKey(Stmt);
1400  GroupMapping.AddKey(Stmt);
1401  //printf("Adding statement: ");
1402  //Stmt.Print();
 // Record both directions: group number -> key and key -> row list.
1403  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1404  TGroupKey key = it.GetKey();
1405  TPair<TInt, TIntV> group = it.GetDat();
1406  GroupIDMapping.GetDat(Stmt).AddDat(group.Val1, TGroupKey(key));
1407  GroupMapping.GetDat(Stmt).AddDat(TGroupKey(key), TIntV(group.Val2));
1408  }
1409  }
1410 
1411  // add a column to the table
1412  if (GroupColName != "") {
1413  StoreGroupCol(GroupColName, GroupAndRowIds);
1414  AddSchemaCol(GroupColName, atInt); // update schema
1415  }
1416 }
1417 
1418 /*
1419 // Core grouping logic.
1420 #ifdef USE_OPENMP
1421 void TTable::GroupAuxMP(const TStrV& GroupBy, THashGenericMP<TGroupKey, TPair<TInt, TIntV> >& Grouping,
1422  TBool Ordered, const TStr& GroupColName, TBool KeepUnique, TIntV& UniqueVec, TBool UsePhysicalIds) {
1423  //double startFn = omp_get_wtime();
1424  TIntV IntGroupByCols;
1425  TIntV FltGroupByCols;
1426  TIntV StrGroupByCols;
1427  // get indices for each column type
1428  for (TInt c = 0; c < GroupBy.Len(); c++) {
1429  if (!IsColName(GroupBy[c])) {
1430  TExcept::Throw("no such column " + GroupBy[c]);
1431  }
1432 
1433  TPair<TAttrType, TInt> ColType = GetColTypeMap(GroupBy[c]);
1434  switch (ColType.Val1) {
1435  case atInt:
1436  IntGroupByCols.Add(ColType.Val2);
1437  break;
1438  case atFlt:
1439  FltGroupByCols.Add(ColType.Val2);
1440  break;
1441  case atStr:
1442  StrGroupByCols.Add(ColType.Val2);
1443  break;
1444  }
1445  }
1446 
1447  TInt IKLen = IntGroupByCols.Len();
1448  TInt FKLen = FltGroupByCols.Len();
1449  TInt SKLen = StrGroupByCols.Len();
1450 
1451  TInt GroupNum = 0;
1452  TInt IdColIdx = GetColIdx(IdColName);
1453 
1454  //double endInit = omp_get_wtime();
1455  //printf("Init time = %f\n", endInit-startFn);
1456 
1457  TVec<TPair<TInt, TInt> > GroupAndRowIds;
1458 
1459  // iterate over rows
1460  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
1461  TIntV IKey(IKLen + SKLen, 0);
1462  TFltV FKey(FKLen, 0);
1463  TIntV SKey(SKLen, 0);
1464 
1465  // find group key
1466  for (TInt c = 0; c < IKLen; c++) {
1467  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
1468  }
1469  for (TInt c = 0; c < FKLen; c++) {
1470  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
1471  }
1472  for (TInt c = 0; c < SKLen; c++) {
1473  SKey.Add(it.GetStrMapById(StrGroupByCols[c]));
1474  }
1475  if (!Ordered) {
1476  if (IKLen > 0) { IKey.ISort(0, IKey.Len()-1, true); }
1477  if (FKLen > 0) { FKey.ISort(0, FKey.Len()-1, true); }
1478  if (SKLen > 0) { SKey.ISort(0, SKey.Len()-1, true); }
1479  }
1480  for (TInt c = 0; c < SKLen; c++) {
1481  IKey.Add(SKey[c]);
1482  }
1483 
1484  // look for group matching the key
1485  TGroupKey GroupKey = TGroupKey(IKey, FKey);
1486 
1487  TInt RowIdx = it.GetRowIdx();
1488  if (!Grouping.IsKey(GroupKey)) {
1489  // Grouping key hasn't been seen before, create a new group
1490  TPair<TInt, TIntV> NewGroup;
1491  NewGroup.Val1 = GroupNum;
1492  if(IdColIdx > 0){
1493  NewGroup.Val2.Add(IntCols[IdColIdx][RowIdx]);
1494  }
1495  Grouping.AddDat(GroupKey, NewGroup);
1496  if (GroupColName != "") {
1497  GroupAndRowIds.Add(TPair<TInt, TInt>(GroupNum, RowIdx));
1498  }
1499  if (KeepUnique) {
1500  UniqueVec.Add(RowIdx);
1501  }
1502  GroupNum++;
1503  } else {
1504  // Grouping key has been seen before, update corresponding group
1505  if (!KeepUnique) {
1506  TPair<TInt, TIntV>& NewGroup = Grouping.GetDat(GroupKey);
1507  if(IdColIdx > 0){
1508  NewGroup.Val2.Add(IntCols[IdColIdx][RowIdx]);
1509  }
1510  if (GroupColName != "") {
1511  GroupAndRowIds.Add(TPair<TInt, TInt>(NewGroup.Val1, RowIdx));
1512  }
1513  }
1514  }
1515  }
1516 
1517  //double endIter = omp_get_wtime();
1518  //printf("Iter time = %f\n", endIter-endInit);
1519 
1520  // update group mapping
1521  if (!KeepUnique) {
1522  TPair<TStrV, TBool> GroupStmt(GroupBy, Ordered);
1523  GroupStmtNames.AddDat(GroupColName, GroupStmt);
1524  GroupIDMapping.AddDat(GroupStmt);
1525  GroupMapping.AddDat(GroupStmt);
1526  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it < Grouping.EndI(); it++) {
1527  TGroupKey key = it.GetKey();
1528  TPair<TInt, TIntV> group = it.GetDat();
1529  GroupIDMapping.GetDat(GroupStmt).AddDat(group.Val1, key);
1530  GroupMapping.GetDat(GroupStmt).AddDat(key, group.Val2);
1531  }
1532  }
1533 
1534  //double endMapping = omp_get_wtime();
1535  //printf("Mapping time = %f\n", endMapping-endIter);
1536 
1537  // add a column to the table
1538  if (GroupColName != "") {
1539  StoreGroupCol(GroupColName, GroupAndRowIds);
1540  AddSchemaCol(GroupColName, atInt); // update schema
1541  }
1542 
1543  //double endStore = omp_get_wtime();
1544  //printf("Store time = %f\n", endStore-endMapping);
1545 }
1546 #endif // USE_OPENMP
1547 */
1548 
1549 void TTable::Group(const TStrV& GroupBy, const TStr& GroupColName, TBool Ordered, TBool UsePhysicalIds) {
1550  TStrV NGroupBy = NormalizeColNameV(GroupBy);
1551  TStr NGroupColName = NormalizeColName(GroupColName);
1552  TIntV UniqueVec;
1554  GroupAux(NGroupBy, Grouping, Ordered, NGroupColName, false, UniqueVec, UsePhysicalIds);
1555 }
1556 
// NOTE(review): unimplemented stub; its function header (listing line 1557)
// is absent from this extract.
1558  //TODO
1559 }
1560 
// NOTE(review): unimplemented stub; its function header (listing line 1561)
// is absent from this extract.
1562  //TODO
1563 }
1564 
1565 void TTable::Aggregate(const TStrV& GroupByAttrs, TAttrAggr AggOp,
1566  const TStr& ValAttr, const TStr& ResAttr, TBool Ordered) {
1567 
1568  for (TInt c = 0; c < GroupByAttrs.Len(); c++) {
1569  if (!IsColName(GroupByAttrs[c])) {
1570  TExcept::Throw("no such column " + GroupByAttrs[c]);
1571  }
1572  }
1573 
1574  // double startFn = omp_get_wtime();
1575  TStrV NGroupByAttrs = NormalizeColNameV(GroupByAttrs);
1576  TBool UsePhysicalIds = (GetColIdx(IdColName) < 0);
1577 
1578  THash<TInt,TIntV> GroupByIntMapping;
1579  THash<TFlt,TIntV> GroupByFltMapping;
1580  THash<TInt,TIntV> GroupByStrMapping;
1581  THash<TGroupKey,TIntV> Mapping;
1582 #ifdef GCC_ATOMIC
1583  THashMP<TInt,TIntV> GroupByIntMapping_MP(NumValidRows);
1584  TIntV GroupByIntMPKeys(NumValidRows);
1585 #endif
1586  TInt NumOfGroups = 0;
1587  TInt GroupingCase = 0;
1588 
1589  // check if grouping already exists
1590  GroupStmt Stmt(NGroupByAttrs, Ordered, UsePhysicalIds);
1591  if (GroupMapping.IsKey(Stmt)) {
1592  Mapping = GroupMapping.GetDat(Stmt);
1593  } else{
1594  if(NGroupByAttrs.Len() == 1){
1595  switch(GetColType(NGroupByAttrs[0])){
1596  case atInt:
1597 #ifdef GCC_ATOMIC
1598  if(GetMP()){
1599  GroupByIntColMP(NGroupByAttrs[0], GroupByIntMapping_MP, UsePhysicalIds);
1600  int x = 0;
1601  for(THashMP<TInt,TIntV>::TIter it = GroupByIntMapping_MP.BegI(); it < GroupByIntMapping_MP.EndI(); it++){
1602  GroupByIntMPKeys[x] = it.GetKey();
1603  x++;
1604  /*
1605  printf("%d --> ", it.GetKey().Val);
1606  TIntV& V = it.GetDat();
1607  for(int i = 0; i < V.Len(); i++){
1608  printf(" %d", V[i].Val);
1609  }
1610  printf("\n");
1611  */
1612  }
1613  NumOfGroups = x;
1614  GroupingCase = 4;
1615  //printf("Number of groups: %d\n", NumOfGroups.Val);
1616  break;
1617  }
1618 #endif // GCC_ATOMIC
1619  GroupByIntCol(NGroupByAttrs[0], GroupByIntMapping, TIntV(), true, UsePhysicalIds);
1620  NumOfGroups = GroupByIntMapping.Len();
1621  GroupingCase = 1;
1622  break;
1623  case atFlt:
1624  GroupByFltCol(NGroupByAttrs[0], GroupByFltMapping, TIntV(), true, UsePhysicalIds);
1625  NumOfGroups = GroupByFltMapping.Len();
1626  GroupingCase = 2;
1627  break;
1628  case atStr:
1629  GroupByStrCol(NGroupByAttrs[0], GroupByStrMapping, TIntV(), true, UsePhysicalIds);
1630  NumOfGroups = GroupByStrMapping.Len();
1631  GroupingCase = 3;
1632  break;
1633  }
1634  }
1635  else{
1636  TIntV UniqueVector;
1638  GroupAux(NGroupByAttrs, Mapping_aux, Ordered, "", false, UniqueVector, UsePhysicalIds);
1639  for(THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Mapping_aux.BegI(); it < Mapping_aux.EndI(); it++){
1640  Mapping.AddDat(it.GetKey(), it.GetDat().Val2);
1641  }
1642  NumOfGroups = Mapping.Len();
1643  }
1644  }
1645 
1646  // double endGroup = omp_get_wtime();
1647  // printf("Group time = %f\n", endGroup-startFn);
1648 
1649  TAttrType T = GetColType(ValAttr);
1650 
1651  // add column corresponding to result attribute type
1652  if (AggOp == aaCount) { AddIntCol(ResAttr); }
1653  else {
1654  if (T == atInt) { AddIntCol(ResAttr); }
1655  else if (T == atFlt) { AddFltCol(ResAttr); }
1656  else {
1657  // Count is the only aggregation operation handled for Str
1658  TExcept::Throw("Invalid aggregation for Str type!");
1659  }
1660  }
1661  TInt ColIdx = GetColIdx(ResAttr);
1662  TInt AggrColIdx = GetColIdx(ValAttr);
1663 
1664  // double endAdd = omp_get_wtime();
1665  // printf("AddCol time = %f\n", endAdd-endGroup);
1666 
1667 #ifdef USE_OPENMP
1668  #pragma omp parallel for schedule(dynamic)
1669 #endif
1670  for (int g = 0; g < NumOfGroups; g++) {
1671  TIntV* GroupRows = NULL;
1672  switch(GroupingCase){
1673  case 0:
1674  GroupRows = & Mapping.GetDat(Mapping.GetKey(g));
1675  break;
1676  case 1:
1677  GroupRows = & GroupByIntMapping.GetDat(GroupByIntMapping.GetKey(g));
1678  break;
1679  case 2:
1680  GroupRows = & GroupByIntMapping.GetDat(GroupByIntMapping.GetKey(g));
1681  break;
1682  case 3:
1683  GroupRows = & GroupByStrMapping.GetDat(GroupByStrMapping.GetKey(g));
1684  break;
1685  case 4:
1686 #ifdef GCC_ATOMIC
1687  GroupRows = & GroupByIntMapping_MP.GetDat(GroupByIntMPKeys[g]);
1688 #endif
1689  break;
1690  }
1691 
1692  // find valid rows of group
1693  /*
1694  TIntV ValidRows;
1695  for (TInt i = 0; i < GroupRows.Len(); i++) {
1696  // TODO: This should not be necessary
1697  if (!RowIdMap.IsKey(GroupRows[i])) { continue; }
1698  TInt RowId = RowIdMap.GetDat(GroupRows[i]);
1699  // GroupRows has physical row indices
1700  if (RowId != Invalid) { ValidRows.Add(RowId); }
1701  }
1702  */
1703  TIntV& ValidRows = *GroupRows;
1704  TInt sz = ValidRows.Len();
1705  if (sz <= 0) continue;
1706  // Count is handled separately (other operations have aggregation policies defined in a template)
1707  if (AggOp == aaCount) {
1708  for (TInt i = 0; i < sz; i++) { IntCols[ColIdx][ValidRows[i]] = sz; }
1709  } else {
1710  // aggregate based on column type
1711  if (T == atInt) {
1712  TIntV V;
1713  for (TInt i = 0; i < sz; i++) { V.Add(IntCols[AggrColIdx][ValidRows[i]]); }
1714  TInt Res = AggregateVector<TInt>(V, AggOp);
1715  if (AggOp == aaMean) { Res = Res / sz; }
1716  for (TInt i = 0; i < sz; i++) { IntCols[ColIdx][ValidRows[i]] = Res; }
1717  } else {
1718  TFltV V;
1719  for (TInt i = 0; i < sz; i++) { V.Add(FltCols[AggrColIdx][ValidRows[i]]); }
1720  TFlt Res = AggregateVector<TFlt>(V, AggOp);
1721  if (AggOp == aaMean) { Res /= sz; }
1722  for (TInt i = 0; i < sz; i++) { FltCols[ColIdx][ValidRows[i]] = Res; }
1723  }
1724  }
1725  }
1726  // double endIter = omp_get_wtime();
1727  // printf("Iter time = %f\n", endIter-endAdd);
1728 }
1729 
1730 void TTable::AggregateCols(const TStrV& AggrAttrs, TAttrAggr AggOp, const TStr& ResAttr) {
1732  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1733  Info.Add(GetColTypeMap(AggrAttrs[i]));
1734  if (Info[i].Val1 != Info[0].Val1) {
1735  TExcept::Throw("AggregateCols: Aggregation attributes must have the same type");
1736  }
1737  }
1738 
1739  if (Info[0].Val1 == atInt) {
1740  AddIntCol(ResAttr);
1741  TInt ResIdx = GetColIdx(ResAttr);
1742 
1743  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1744  TInt RowIdx = RI.GetRowIdx();
1745  TIntV V;
1746  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1747  V.Add(IntCols[Info[i].Val2][RowIdx]);
1748  }
1749  IntCols[ResIdx][RowIdx] = AggregateVector<TInt>(V, AggOp);
1750  }
1751  } else if (Info[0].Val1 == atFlt) {
1752  AddFltCol(ResAttr);
1753  TInt ResIdx = GetColIdx(ResAttr);
1754 
1755  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1756  TInt RowIdx = RI.GetRowIdx();
1757  TFltV V;
1758  for (TInt i = 0; i < AggrAttrs.Len(); i++) {
1759  V.Add(FltCols[Info[i].Val2][RowIdx]);
1760  }
1761  FltCols[ResIdx][RowIdx] = AggregateVector<TFlt>(V, AggOp);
1762  }
1763  } else {
1764  TExcept::Throw("AggregateCols: Only Int and Flt aggregation supported right now");
1765  }
1766 }
1767 
// Prints every entry of the grouping `Mapping` to stdout as
// "<int key values> <flt key values> --><row identifiers>", one line per group.
// NOTE(review): the function header (listing line 1768) is absent from this extract.
1769  for(THash<TGroupKey, TIntV>::TIter it = Mapping.BegI(); it < Mapping.EndI(); it++){
1770  TGroupKey gk = it.GetKey();
 // The key is a pair: integer components in Val1, float components in Val2.
1771  TIntV ik = gk.Val1;
1772  TFltV fk = gk.Val2;
1773  for(int i = 0; i < ik.Len(); i++){ printf("%d ",ik[i].Val);}
1774  for(int i = 0; i < fk.Len(); i++){ printf("%f ",fk[i].Val);}
1775  printf("-->");
1776  TIntV v = it.GetDat();
1777  for(int i = 0; i < v.Len(); i++){ printf("%d ",v[i].Val);}
1778  printf("\n");
1779  }
1780 }
1781 
1782 void TTable::Count(const TStr& CountColName, const TStr& Col) {
1783  TStrV GroupByAttrs;
1784  GroupByAttrs.Add(CountColName);
1785  Aggregate(GroupByAttrs, aaCount, "", Col);
1786 }
1787 
1788 TVec<PTable> TTable::SpliceByGroup(const TStrV& GroupBy, TBool Ordered) {
1789  TStrV NGroupBy = NormalizeColNameV(GroupBy);
1790  TIntV UniqueVec;
1792  TVec<PTable> Result;
1793 
1794  Schema NewSchema;
1795  for (TInt c = 0; c < Sch.Len(); c++) {
1796  if (Sch[c].Val1 != GetIdColName()) {
1797  NewSchema.Add(Sch[c]);
1798  }
1799  }
1800 
1801  GroupAux(NGroupBy, Grouping, Ordered, "", false, UniqueVec);
1802 
1803  TInt cnt = 0;
1804  // iterate over groups
1805  for (THash<TGroupKey, TPair<TInt, TIntV> >::TIter it = Grouping.BegI(); it != Grouping.EndI(); it++) {
1806  PTable GroupTable = TTable::New(NewSchema, Context);
1807 
1808  TVec<TPair<TAttrType, TInt> > ColInfo;
1809  TIntV V;
1810  for (TInt i = 0; i < Sch.Len(); i++) {
1811  ColInfo.Add(GroupTable->GetColTypeMap(Sch[i].Val1));
1812  if (Sch[i].Val1 == IdColName()) {
1813  ColInfo[i].Val2 = -1;
1814  }
1815  V.Add(GetColIdx(Sch[i].Val1));
1816  }
1817 
1818  TIntV& Rows = it.GetDat().Val2;
1819 
1820  // iterate over rows in group
1821  for (TInt i = 0; i < Rows.Len(); i++) {
1822  // convert from permanent ID to row ID
1823  TInt RowIdx = RowIdMap.GetDat(Rows[i]);
1824 
1825  // iterate over schema
1826  for (TInt c = 0; c < Sch.Len(); c++) {
1827  TPair<TAttrType, TInt> Info = ColInfo[c];
1828  TInt ColIdx = Info.Val2;
1829 
1830  if (ColIdx == -1) { continue; }
1831 
1832  // add row to new group
1833  switch (Info.Val1) {
1834  case atInt:
1835  GroupTable->IntCols[ColIdx].Add(IntCols[V[c]][RowIdx]);
1836  break;
1837  case atFlt:
1838  GroupTable->FltCols[ColIdx].Add(FltCols[V[c]][RowIdx]);
1839  break;
1840  case atStr:
1841  GroupTable->StrColMaps[ColIdx].Add(StrColMaps[V[c]][RowIdx]);
1842  break;
1843  }
1844 
1845  }
1846  if (GroupTable->LastValidRow >= 0) {
1847  GroupTable->Next[GroupTable->LastValidRow] = GroupTable->NumRows;
1848  }
1849  GroupTable->Next.Add(GroupTable->Last);
1850  GroupTable->LastValidRow = GroupTable->NumRows;
1851 
1852  GroupTable->NumRows++;
1853  GroupTable->NumValidRows++;
1854  }
1855  GroupTable->InitIds();
1856  Result.Add(GroupTable);
1857 
1858  cnt += 1;
1859  }
1860  return Result;
1861 }
1862 
// Sets the id column name to "_id".
// NOTE(review): the function header and one statement (listing lines 1863 and
// 1866) are absent from this extract.
1864  IdColName = "_id";
1865  //Assert(NumRows == NumValidRows);
1867 }
1868 
// Renumbers the id column: valid rows receive consecutive ids 0, 1, 2, ... in
// iteration order, and RowIdMap is rebuilt from scratch.
// NOTE(review): the function header (listing line 1869) is absent from this extract.
// NOTE(review): RowIdMap.AddDat is called here as (row index, id), whereas
// AddIdColumn calls it as (id, row index) -- confirm which order is intended.
1870  RowIdMap.Clr();
1871  TInt IdColIdx = GetColIdx(IdColName);
1872  TInt IdCnt = 0;
1873  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1874  IntCols[IdColIdx][RI.GetRowIdx()] = IdCnt;
1875  RowIdMap.AddDat(RI.GetRowIdx(), IdCnt);
1876  IdCnt++;
1877  }
1878 }
1879 
1880 void TTable::AddIdColumn(const TStr& ColName) {
1881  //printf("NumRows: %d\n", NumRows.Val);
1882  TInt IdCol = IntCols.Add();
1883  IntCols[IdCol].Reserve(NumRows, NumRows);
1884  //printf("IdCol Reserved\n");
1885  TInt IdCnt = 0;
1886  RowIdMap.Clr();
1887  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
1888  IntCols[IdCol][RI.GetRowIdx()] = IdCnt;
1889  RowIdMap.AddDat(IdCnt, RI.GetRowIdx());
1890  IdCnt++;
1891  }
1892  AddSchemaCol(ColName, atInt);
1893  AddColType(ColName, atInt, IntCols.Len()-1);
1894 }
1895 
// Creates an empty table whose columns are those of this table followed by
// those of `Table`, plus a trailing integer "_id" column.
// NOTE(review): the function header (listing line 1896) is absent from this extract.
1897  PTable JointTable = New(Context);
 // One extra int column is reserved for the "_id" column added at the end.
1898  JointTable->IntCols = TVec<TIntV>(IntCols.Len() + Table.IntCols.Len() + 1);
1899  JointTable->FltCols = TVec<TFltV>(FltCols.Len() + Table.FltCols.Len());
1900  JointTable->StrColMaps = TVec<TIntV>(StrColMaps.Len() + Table.StrColMaps.Len());
 // Columns of this table keep their column indices.
1901  for (TInt i = 0; i < Sch.Len(); i++) {
1902  TStr ColName = GetSchemaColName(i);
1903  TAttrType ColType = GetSchemaColType(i);
1904  TStr CName = JointTable->RenumberColName(ColName);
1905  TPair<TAttrType, TInt> TypeMap = GetColTypeMap(ColName);
1906  JointTable->AddColType(CName, TypeMap);
1907  //JointTable->AddLabel(CName, ColName);
1908  JointTable->AddSchemaCol(CName, ColType);
1909  }
 // Columns of `Table` are shifted past this table's columns of the same type.
1910  for (TInt i = 0; i < Table.Sch.Len(); i++) {
1911  TStr ColName = Table.GetSchemaColName(i);
1912  TAttrType ColType = Table.GetSchemaColType(i);
1913  TStr CName = JointTable->RenumberColName(ColName);
1914  TPair<TAttrType, TInt> NewDat = Table.GetColTypeMap(ColName);
1915  Assert(ColType == NewDat.Val1);
1916  // add offsets
1917  switch (NewDat.Val1) {
1918  case atInt:
1919  NewDat.Val2 += IntCols.Len();
1920  break;
1921  case atFlt:
1922  NewDat.Val2 += FltCols.Len();
1923  break;
1924  case atStr:
1925  NewDat.Val2 += StrColMaps.Len();
1926  break;
1927  }
1928  JointTable->AddColType(CName, NewDat);
1929  JointTable->AddSchemaCol(CName, ColType);
1930  }
 // This local is named like the IdColName member that other functions in
 // this file read; it is registered as the last int column.
1931  TStr IdColName = "_id";
1932  JointTable->AddColType(IdColName, atInt, IntCols.Len() + Table.IntCols.Len());
1933  JointTable->AddSchemaCol(IdColName, atInt);
1934  return JointTable;
1935 }
1936 
// Appends one row to this (joint) table: the values of row RowIdx1 of T1
// followed by the values of row RowIdx2 of T2, plus a value in the int column
// that follows T2's int columns.
//  T1, T2            source tables; T1's columns come first within each column type
//  RowIdx1, RowIdx2  row indices in T1 and T2 respectively
// NOTE(review): two statements (listing lines 1964 and 1967) are absent from
// this extract.
1937 void TTable::AddJointRow(const TTable& T1, const TTable& T2, TInt RowIdx1, TInt RowIdx2) {
1938  for (TInt i = 0; i < T1.IntCols.Len(); i++) {
1939  IntCols[i].Add(T1.IntCols[i][RowIdx1]);
1940  }
1941  for (TInt i = 0; i < T1.FltCols.Len(); i++) {
1942  FltCols[i].Add(T1.FltCols[i][RowIdx1]);
1943  }
1944  for (TInt i = 0; i < T1.StrColMaps.Len(); i++) {
1945  StrColMaps[i].Add(T1.StrColMaps[i][RowIdx1]);
1946  }
 // T2's columns are stored after T1's columns of the same type.
1947  TInt IntOffset = T1.IntCols.Len();
1948  TInt FltOffset = T1.FltCols.Len();
1949  TInt StrOffset = T1.StrColMaps.Len();
1950  for (TInt i = 0; i < T2.IntCols.Len(); i++) {
1951  IntCols[i+IntOffset].Add(T2.IntCols[i][RowIdx2]);
1952  }
1953  for (TInt i = 0; i < T2.FltCols.Len(); i++) {
1954  FltCols[i+FltOffset].Add(T2.FltCols[i][RowIdx2]);
1955  }
1956  for (TInt i = 0; i < T2.StrColMaps.Len(); i++) {
1957  StrColMaps[i+StrOffset].Add(T2.StrColMaps[i][RowIdx2]);
1958  }
1959  TInt IdOffset = IntOffset + T2.IntCols.Len();
1960  NumRows++;
1961  NumValidRows++;
 // Link the previous tail of the row chain to the new row.
1962  if (!Next.Empty()) {
1963  Next[Next.Len()-1] = NumValidRows-1;
1965  }
1966  Next.Add(Last);
 // The new row's index is stored in the int column at IdOffset.
1968  IntCols[IdOffset].Add(NumRows-1);
1969 }
1970 
1974 PTable TTable::SimJoin(const TStrV& Cols1, const TTable& Table, const TStrV& Cols2, const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold)
1975 {
1976  Assert(Cols1.Len() == Cols2.Len());
1977 
1978  if(Cols1.Len()!=Cols2.Len()){
1979  TExcept::Throw("Column vectors must match in type and length");
1980  }
1981 
1982  for (TInt i = 0; i < Cols1.Len(); i++) {
1983  if(!IsColName(Cols1[i]) || !Table.IsColName(Cols2[i])){
1984  TExcept::Throw("Column not found in Table");
1985  }
1986 
1987  TAttrType Type1 = GetColType(Cols1[i]);
1988  TAttrType Type2 = GetColType(Cols2[i]);
1989 
1990  if(Type1!=Type2){
1991  TExcept::Throw("Column types on the two tables must match.");
1992  }
1993 
1994  // When supporting more distance metrics, check if the types are supported for given metric.
1995  if((Type1!=atInt && Type1!=atFlt) || (Type2!=atInt && Type2!=atFlt)){
1996  TExcept::Throw("Column type not supported. Only Flt and Int column types are supported.");
1997  }
1998  }
1999 
2000  // Initialize Join table and add the similarity column
2001  PTable JointTable = InitializeJointTable(Table);
2002  TFltV DistanceV;
2003 
2004  // O(n^2): Parallelize
2005  for(TRowIterator RowI = this->BegRI(); RowI < this->EndRI(); RowI++) {
2006  for(TRowIterator RowI2 = Table.BegRI(); RowI2 < Table.EndRI(); RowI2++) {
2007  float distance = 0;
2008 
2009  switch(SimType)
2010  {
2011  // Calculate the distance metric
2012  case L2Norm:
2013  for(TInt i = 0; i < Cols1.Len(); i++) {
2014  float attrVal1, attrVal2;
2015  attrVal1 = GetColType(Cols1[i])==atInt ? (float)RowI.GetIntAttr(Cols1[i]) : (float)RowI.GetFltAttr(Cols1[i]);
2016  attrVal2 = Table.GetColType(Cols2[i])==atInt ? (float)RowI2.GetIntAttr(Cols2[i]) : (float)RowI2.GetFltAttr(Cols2[i]);
2017  distance += pow(attrVal1 - attrVal2, 2);
2018  }
2019 
2020  distance = sqrt(distance);
2021 
2022  if(distance<=Threshold){
2023  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), RowI2.GetRowIdx());
2024  DistanceV.Add(distance);
2025  }
2026 
2027  // Add row to the joint table if distance <= Threshold
2028  break;
2029  // Haversine distance to calculate the distance between two points on Earth from latitude/longitude
2030  case Haversine:
2031  {
2032  if(Cols1.Len()!=2){
2033  TExcept::Throw("Haversine disance expects exactly two attributes - latitude and longitude - in that order.");
2034  }
2035 
2036  // Block to prevent cross-initialization error from compiler
2037  TFlt Radius = 6373; // km
2038  float Latitude1 = GetColType(Cols1[0])==atInt ? (float)RowI.GetIntAttr(Cols1[0]) : (float)RowI.GetFltAttr(Cols1[0]);
2039  float Latitude2 = Table.GetColType(Cols2[0])==atInt ? (float)RowI2.GetIntAttr(Cols2[0]) : (float)RowI2.GetFltAttr(Cols2[0]);
2040 
2041  float Longitude1 = GetColType(Cols1[1])==atInt ? (float)RowI.GetIntAttr(Cols1[1]) : (float)RowI.GetFltAttr(Cols1[1]);
2042  float Longitude2 = Table.GetColType(Cols2[1])==atInt ? (float)RowI2.GetIntAttr(Cols2[1]) : (float)RowI2.GetFltAttr(Cols2[1]);
2043 
2044  Latitude1 *= static_cast<float>(M_PI/180.0);
2045  Latitude2 *= static_cast<float>(M_PI/180.0);
2046  Longitude1 *= static_cast<float>(M_PI/180.0);
2047  Longitude2 *= static_cast<float>(M_PI/180.0);
2048 
2049  float dlon = Longitude2 - Longitude1;
2050  float dlat = Latitude2 - Latitude1;
2051  float a = pow(sin(dlat/2), 2) + cos(Latitude1)*cos(Latitude2)*pow(sin(dlon/2), 2);
2052  float c = 2*atan2(sqrt(a), sqrt(1-a));
2053  distance = (static_cast<float>(Radius.Val))*c;
2054 
2055  if(distance<=Threshold){
2056  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), RowI2.GetRowIdx());
2057  DistanceV.Add(distance);
2058  }
2059  }
2060  break;
2061  case L1Norm:
2062  case Jaccard:
2063  TExcept::Throw("This distance metric is not supported");
2064  }
2065  }
2066  }
2067 
2068  // Add the value for the similarity column
2069  JointTable->StoreFltCol(DistanceColName, DistanceV);
2070  JointTable->InitIds();
2071  return JointTable;
2072 }
2073 
// Group-level self similarity join on a single grouping column.
// Every distinct value of the GroupAttr column identifies a group, and a group is
// represented by the set of distinct SimCol values found in its rows. For each
// ordered pair of groups (both loops below walk the whole hash, so self pairs and
// both orderings are visited) the distance 1 - |intersection| / |union| of the two
// value sets is computed. Pairs whose distance is <= Threshold become rows
// (GroupId_1, GroupId_2, DistanceColName) of the returned table.
// Throws when SimCol or GroupAttr is not a column of this table, and when a row is
// visited while the type of SimCol is neither atInt nor atStr.
// NOTE(review): SimType is not read by any line visible here.
// NOTE(review): this listing skips source lines 2088, 2093, 2097 and 2109, so the
// declarations of Group, TIntHH and TIntH used below are not visible.
2074 PTable TTable::SelfSimJoinPerGroup(const TStr& GroupAttr, const TStr& SimCol, const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold)
2075 {
2076  if(!IsColName(SimCol) || !IsColName(GroupAttr)){
2077  TExcept::Throw("No such column found in table");
2078  }
2079 
2080  PTable JointTable = New(Context);
2081  // Initialize the joint table - (GroupId1, GroupId2, Similarity)
2082  JointTable->IntCols = TVec<TIntV>(2);
2083  JointTable->FltCols = TVec<TFltV>(1);
2084 
  // Register the two integer columns GroupId_1 and GroupId_2.
2085  for(TInt i=0;i<2;i++){
2086  TInt Suffix = i+1;
2087  TStr CName = "GroupId_" + Suffix.GetStr();
2089  JointTable->AddColType(CName, Group);
2090  JointTable->AddSchemaCol(CName, atInt);
2091  }
2092 
  // Register the float column that receives the distance of each emitted pair.
2094  JointTable->AddColType(DistanceColName, Group);
2095  JointTable->AddSchemaCol(DistanceColName, atFlt);
2096 
2098 
2099  TAttrType attrType = GetColType(SimCol);
2100  TInt GroupColIdx = GetColIdx(GroupAttr);
2101  TInt SimColIdx = GetColIdx(SimCol);
2102 
  // Pass 1: build TIntHH, mapping group id -> set of SimCol values (stored as hash keys).
  // The group id is read from IntCols, i.e. GroupAttr is treated as an integer column.
2103  for (TRowIterator RowI = this->BegRI(); RowI < this->EndRI(); RowI++) {
2104  TInt GroupId = IntCols[GroupColIdx][RowI.GetRowIdx()];
2105 
2106  if(attrType==atInt || attrType==atStr)
2107  {
2108  if(!TIntHH.IsKey(GroupId)){
2110  TIntHH.AddDat(GroupId, TIntH);
2111  }
2112 
2113  THash<TInt, TInt>& TIntH = TIntHH.GetDat(GroupId);
  // String attributes are represented by the integer stored in StrColMaps.
2114  TInt SimAttrVal = (attrType==atInt ? IntCols[SimColIdx][RowI.GetRowIdx()] : StrColMaps[SimColIdx][RowI.GetRowIdx()]);
  // Only the key matters; the stored value 0 is a placeholder.
2115  TIntH.AddDat(SimAttrVal, 0);
2116  }
2117  else
2118  {
2119  TExcept::Throw("Attribute type not supported.");
2120  }
2121  }
2122 
2123  // Iterate through every pair of groups and calculate the distance
2124  for (THash<TInt, THash<TInt, TInt> >::TIter it1 = TIntHH.BegI(); it1 < TIntHH.EndI(); it1++) {
  // Vals1H is a copy of the value set of the outer group.
2125  THash<TInt, TInt> Vals1H = it1.GetDat();
2126  TInt GroupId1 = it1.GetKey();
2127 
2128  for (THash<TInt, THash<TInt, TInt> >::TIter it2 = TIntHH.BegI(); it2 < TIntHH.EndI(); it2++) {
2129  int intersectionCount = 0;
2130  TInt GroupId2 = it2.GetKey();
  // Vals2H is likewise a copy, made once per pair of groups.
2131  THash<TInt, TInt> Vals2H = it2.GetDat();
2132 
  // Count the values of group 1 that also occur in group 2.
2133  for(THash<TInt, TInt>::TIter it = Vals1H.BegI(); it < Vals1H.EndI(); it++)
2134  {
2135  TInt Val = it.GetKey();
2136  if(Vals2H.IsKey(Val)){
2137  intersectionCount+=1;
2138  }
2139  }
2140 
  // |A union B| = |A| + |B| - |A intersect B|.
2141  int unionCount = Vals1H.Len() + Vals2H.Len() - intersectionCount;
2142  float distance = 1.0f - (float)intersectionCount/unionCount;
2143 
2144  // Add a new row to the JointTable
2145  if(distance<=Threshold){
2146  JointTable->IntCols[0].Add(GroupId1);
2147  JointTable->IntCols[1].Add(GroupId2);
2148  JointTable->FltCols[0].Add(distance);
2149  JointTable->IncrementNext();
2150  }
2151  }
2152  }
2153 
2154  JointTable->InitIds();
2155  return JointTable;
2156 }
2157 
// Group-level self similarity join where groups are defined by several columns.
// Steps performed below:
//   1. project this table in place onto the GroupBy columns plus SimCol,
//   2. call GroupAux with the column name "Group" and fill Grouping,
//   3. run the single-column overload on the "Group" column,
//   4. for every resulting pair of group ids, add a joint row built from one
//      representative row of each group,
//   5. keep only DistanceColName and the columns whose name contains a GroupBy name.
// NOTE(review): step 1 calls ProjectInPlace on this table, so the receiver itself
// is modified, not just the returned table.
// NOTE(review): this listing skips source line 2176, so the declaration of
// Grouping is not visible here.
2160 PTable TTable::SelfSimJoinPerGroup(const TStrV& GroupBy, const TStr& SimCol,
2161  const TStr& DistanceColName, const TSimType& SimType, const TFlt& Threshold) {
2162  TStrV NGroupBy = NormalizeColNameV(GroupBy);
2163  TStrV ProjectionV;
2164 
2165  // Only keep the GroupBy cols and the SimCol
2166  for(TInt i=0; i<GroupBy.Len(); i++)
2167  {
2168  ProjectionV.Add(GroupBy[i]);
2169  }
2170 
2171  ProjectionV.Add(SimCol);
2172  ProjectInPlace(ProjectionV);
2173 
2174  TStr CName = "Group";
2175  TIntV UniqueVec;
  // presumably GroupAux stores each row's group id in a column named CName; the
  // single-column overload called next reads that column - TODO confirm.
2177  GroupAux(NGroupBy, Grouping, false, CName, false, UniqueVec);
2178  PTable GroupJointTable = SelfSimJoinPerGroup(CName, SimCol, DistanceColName, SimType, Threshold);
2179  PTable JointTable = InitializeJointTable(*this);
2180 
2181  // Hash of groupid to any arbitrary row of that group. Arbitrary because the GroupBy
2182  // columns within that group are the same, so we can choose any one.
2183  THash<TInt, TInt> GroupIdH;
2184 
  // Each Grouping entry holds (group number, row ids of the group).
2185  for(THash<TGroupKey, TPair<TInt, TIntV> >::TIter it=Grouping.BegI(); it<Grouping.EndI(); it++)
2186  {
2187  TPair<TInt, TIntV> group = it.GetDat();
2188  TInt GroupNum = group.Val1;
2189  TIntV RowIds = group.Val2;
2190 
2191  if(!GroupIdH.IsKey(GroupNum))
2192  {
2193  TInt RandomRowId = RowIds[0]; // Arbitrarily select the 1st row.
2194  GroupIdH.AddDat(GroupNum, RandomRowId);
2195  }
2196  }
2197 
  // One joint row is added per GroupJointTable row, in iteration order.
2198  for(TRowIterator RowI = GroupJointTable->BegRI(); RowI < GroupJointTable->EndRI(); RowI++)
2199  {
2200  // The GroupJoinTable has a well defined structure - columns 0 and 1 are GroupIds
2201  TInt GroupId1 = GroupJointTable->IntCols[0][RowI.GetRowIdx()];
2202  TInt GroupId2 = GroupJointTable->IntCols[1][RowI.GetRowIdx()];
2203 
2204  // Get the rows for groupid1 and groupid2 and arbitrarily select one row
2205  TInt RowId1 = GroupIdH.GetDat(GroupId1);
2206  TInt RowId2 = GroupIdH.GetDat(GroupId2);
2207  JointTable->AddJointRow(*this, *this, RowId1, RowId2);
2208  }
2209 
2210  // Add the similarity column from the GroupJointTable - GroupJointTable has a
2211  // well defined structure - The first float column is the similarity;
2212  JointTable->StoreFltCol(DistanceColName, GroupJointTable->FltCols[0]);
2213  ProjectionV.Clr();
2214  ProjectionV.Add(DistanceColName);
2215 
2216  // Find the GroupBy columns in the JointTable by matching the Suffix of the Schema
2217  // columns with the original GroupBy columns - Note that Join renames columns.
  // NOTE(review): the match is done with IsStrIn; presumably a substring test, in
  // which case a GroupBy name contained in another column name would also match - confirm.
2218  for(TInt i=0; i<GroupBy.Len(); i++){
2219  for(TInt j=0; j<JointTable->Sch.Len(); j++)
2220  {
2221  TStr ColName = JointTable->Sch[j].Val1;
2222  if(ColName.IsStrIn(GroupBy[i]))
2223  {
2224  ProjectionV.Add(ColName);
2225  }
2226  }
2227  }
2228 
2229  JointTable->ProjectInPlace(ProjectionV);
2230  JointTable->InitIds();
2231  return JointTable;
2232 }
2233 
2234 // Registers one more row at the end of the table: increments NumRows and
// NumValidRows, links the previously last entry of Next to the new row, and
// appends the Last marker for the new row.
// NOTE(review): this listing skips source lines 2235 (the function signature) and
// 2242 (one statement inside the if below), so they are not visible here.
2236 {
2237  // Advance the Next vector
2238  NumRows++;
2239  NumValidRows++;
  // If rows already exist, the old tail must now point at the new row index.
2240  if (!Next.Empty()) {
2241  Next[Next.Len()-1] = NumValidRows-1;
2243  }
  // The new row is the tail of the chain.
2244  Next.Add(Last);
2245 }
2246 
2247 // Q: Do we want to have any guarantees in terms of order of the output rows - i.e.
2248 // ordered by "this" table row idx as primary key and "Table" row idx as secondary key
2249  // This means only keeping joint row indices (pairs of original row indices), sorting them
2250  // and adding all rows in the end. Sorting can be expensive, but we would be able to pre-allocate
2251  // memory for the joint table..
//
// Equi-join of this table with Table on Col1 (of this) == Col2 (of Table).
// The table with fewer valid rows is grouped by its join column into T (join value
// -> row ids); the other table is then scanned and every row whose join value is a
// key of T is paired with all rows of that group. The row index of this table is
// always passed first and the row index of Table second, whichever one was hashed.
// Throws when a column is missing or the two column types differ.
// When GCC_ATOMIC is defined and GetMP() is true, the scan runs over partitions of
// the bigger table in an OpenMP parallel loop: each partition collects its row-id
// pairs in JointRowIDSet[i] and the rows are added afterwards by AddNJointRowsMP.
// NOTE(review): this listing skips source lines 2286, 2330, 2362 and 2416, where T
// is declared for the respective case; those declarations are not visible here.
2252 PTable TTable::Join(const TStr& Col1, const TTable& Table, const TStr& Col2) {
2253  // double startFn = omp_get_wtime();
  // NOTE(review): in the three checks below printf follows TExcept::Throw; if Throw
  // never returns, the printf calls are dead code.
2254  if (!IsColName(Col1)) {
2255  TExcept::Throw("no such column " + Col1);
2256  printf("no such column %s\n", Col1.CStr());
2257  }
2258  if (!Table.IsColName(Col2)) {
2259  TExcept::Throw("no such column " + Col2);
2260  printf("no such column %s\n", Col2.CStr());
2261  }
2262  if (GetColType(Col1) != Table.GetColType(Col2)) {
2263  TExcept::Throw("Trying to Join on columns of different type");
2264  printf("Trying to Join on columns of different type\n");
2265  }
2266  //printf("passed initial checks\n");
2267  // initialize result table
2268  PTable JointTable = InitializeJointTable(Table);
2269  //printf("initialized joint table\n");
2270  // hash smaller table (group by column)
2271  TAttrType ColType = GetColType(Col1);
  // TS / ColS refer to the smaller table (hashed), TB / ColB to the bigger one (scanned).
2272  TBool ThisIsSmaller = (NumValidRows <= Table.NumValidRows);
2273  const TTable& TS = ThisIsSmaller ? *this : Table;
2274  const TTable& TB = ThisIsSmaller ? Table : *this;
2275  TStr ColS = ThisIsSmaller ? Col1 : Col2;
2276  TStr ColB = ThisIsSmaller ? Col2 : Col1;
  // The column index is resolved against the table that owns ColB.
2277  TInt ColBId = ThisIsSmaller ? Table.GetColIdx(ColB) : GetColIdx(ColB);
2278  // double endInit = omp_get_wtime();
2279  // printf("Init time = %f\n", endInit-startFn);
2280  // iterate over the rows of the bigger table and check for "collisions"
2281  // with the group keys for the small table.
2282 #ifdef GCC_ATOMIC
2283  if (GetMP()) {
2284  switch(ColType){
2285  case atInt:{
2287  TS.GroupByIntColMP(ColS, T, true);
2288  // double endGroup = omp_get_wtime();
2289  // printf("Group time = %f\n", endGroup-endInit);
2290 
2291  TIntPrV Partitions;
2292  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
  // Size of the first partition, used as the reserve hint for every per-partition result vector.
2293  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2294  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2295  // double endPart = omp_get_wtime();
2296  // printf("Partition time = %f\n", endPart-endGroup);
2297 
2298  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
2299  for (int i = 0; i < Partitions.Len(); i++){
2300  //double start = omp_get_wtime();
2301  JointRowIDSet[i].Reserve(PartitionSize);
2302  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2303  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2304  while (RowI < EndI) {
2305  TInt K = RowI.GetIntAttr(ColBId);
2306  if(T.IsKey(K)){
2307  TIntV& Group = T.GetDat(K);
2308  for(TInt j = 0; j < Group.Len(); j++){
  // Pair order is (row of this, row of Table).
2309  if(ThisIsSmaller){
2310  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2311  } else{
2312  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2313  }
2314  }
2315  }
2316  RowI++;
2317  }
2318  //double end = omp_get_wtime();
2319  //printf("END: Thread %d: i = %d, start = %d, end = %d, num = %d, time = %f\n", omp_get_thread_num(), i,
2320  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val, JointRowIDSet[i].Len(), end-start);
2321  }
2322  // double endJoin = omp_get_wtime();
2323  // printf("Iterate time = %f\n", endJoin-endPart);
2324  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2325  // double endAdd = omp_get_wtime();
2326  // printf("Add time = %f\n", endAdd-endJoin);
2327  break;
2328  }
2329  case atFlt:{
  // Same scheme as the atInt case, keyed by the float join value.
2331  TS.GroupByFltCol(ColS, T, TIntV(), true);
2332 
2333  TIntPrV Partitions;
2334  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2335  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2336  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2337 
2338  #pragma omp parallel for schedule(dynamic)
2339  for (int i = 0; i < Partitions.Len(); i++){
2340  JointRowIDSet[i].Reserve(PartitionSize);
2341  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2342  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2343  while (RowI < EndI) {
2344  TFlt K = RowI.GetFltAttr(ColBId);
2345  if(T.IsKey(K)){
2346  TIntV& Group = T.GetDat(K);
2347  for(TInt j = 0; j < Group.Len(); j++){
2348  if(ThisIsSmaller){
2349  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2350  } else{
2351  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2352  }
2353  }
2354  }
2355  RowI++;
2356  }
2357  }
2358  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2359  break;
2360  }
2361  case atStr:{
  // Same scheme again; the key is the integer returned by GetStrMapById.
2363  TS.GroupByStrCol(ColS, T, TIntV(), true);
2364 
2365  TIntPrV Partitions;
2366  TB.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2367  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2368  TVec<TIntPrV> JointRowIDSet(Partitions.Len());
2369 
2370  #pragma omp parallel for schedule(dynamic)
2371  for (int i = 0; i < Partitions.Len(); i++){
2372  JointRowIDSet[i].Reserve(PartitionSize);
2373  TRowIterator RowI(Partitions[i].GetVal1(), &TB);
2374  TRowIterator EndI(Partitions[i].GetVal2(), &TB);
2375  while (RowI < EndI) {
2376  TInt K = RowI.GetStrMapById(ColBId);
2377  if(T.IsKey(K)){
2378  TIntV& Group = T.GetDat(K);
2379  for(TInt j = 0; j < Group.Len(); j++){
2380  if(ThisIsSmaller){
2381  JointRowIDSet[i].Add(TIntPr(Group[j], RowI.GetRowIdx()));
2382  } else{
2383  JointRowIDSet[i].Add(TIntPr(RowI.GetRowIdx(), Group[j]));
2384  }
2385  }
2386  }
2387  RowI++;
2388  }
2389  }
2390  JointTable->AddNJointRowsMP(*this, Table, JointRowIDSet);
2391  }
2392  break;
2393  }
2394  } else {
2395 #endif // GCC_ATOMIC
  // Sequential path: joint rows are appended directly while scanning TB.
2396  switch (ColType) {
2397  case atInt:{
2398  TIntIntVH T;
2399  TS.GroupByIntCol(ColS, T, TIntV(), true);
2400  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2401  TInt K = RowI.GetIntAttr(ColBId);
2402  if (T.IsKey(K)) {
2403  TIntV& Group = T.GetDat(K);
2404  for (TInt i = 0; i < Group.Len(); i++) {
2405  if (ThisIsSmaller) {
2406  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2407  } else {
2408  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2409  }
2410  }
2411  }
2412  }
2413  break;
2414  }
2415  case atFlt:{
2417  TS.GroupByFltCol(ColS, T, TIntV(), true);
2418  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2419  TFlt K = RowI.GetFltAttr(ColBId);
2420  if (T.IsKey(K)) {
2421  TIntV& Group = T.GetDat(K);
2422  for (TInt i = 0; i < Group.Len(); i++) {
2423  if (ThisIsSmaller) {
2424  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2425  } else {
2426  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2427  }
2428  }
2429  }
2430  }
2431  break;
2432  }
2433  case atStr:{
2434  TIntIntVH T;
2435  TS.GroupByStrCol(ColS, T, TIntV(), true);
2436  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2437  TInt K = RowI.GetStrMapById(ColBId);
2438  if (T.IsKey(K)) {
2439  TIntV& Group = T.GetDat(K);
2440  for (TInt i = 0; i < Group.Len(); i++) {
2441  if (ThisIsSmaller) {
2442  JointTable->AddJointRow(*this, Table, Group[i], RowI.GetRowIdx());
2443  } else {
2444  JointTable->AddJointRow(*this, Table, RowI.GetRowIdx(), Group[i]);
2445  }
2446  }
2447  }
2448  }
2449  }
2450  break;
2451  }
2452 #ifdef GCC_ATOMIC
2453  }
2454 #endif
2455  return JointTable;
2456 }
2457 
2458 void TTable::ThresholdJoinInputCorrectness(const TStr& KeyCol1, const TStr& JoinCol1, const TTable& Table,
2459  const TStr& KeyCol2, const TStr& JoinCol2){
2460  if (!IsColName(KeyCol1)) {
2461  printf("no such column %s\n", KeyCol1.CStr());
2462  TExcept::Throw("no such column " + KeyCol1);
2463  }
2464  if (!Table.IsColName(KeyCol2)) {
2465  printf("no such column %s\n", KeyCol2.CStr());
2466  TExcept::Throw("no such column " + KeyCol2);
2467  }
2468  if (!IsColName(JoinCol1)) {
2469  printf("no such column %s\n", JoinCol1.CStr());
2470  TExcept::Throw("no such column " + JoinCol1);
2471  }
2472  if (!Table.IsColName(JoinCol2)) {
2473  printf("no such column %s\n", JoinCol2.CStr());
2474  TExcept::Throw("no such column " + JoinCol2);
2475  }
2476  if (GetColType(JoinCol1) != Table.GetColType(JoinCol2)) {
2477  printf("Trying to Join on columns of different type\n");
2478  TExcept::Throw("Trying to Join on columns of different type");
2479  }
2480  if (GetColType(KeyCol1) != Table.GetColType(KeyCol2)) {
2481  printf("Key type mismatch\n");
2482  TExcept::Throw("Key type mismatch");
2483  }
2484 }
2485 
// Counts, for every pair of key values, how many joint tuples the two tables
// produce. TB is scanned row by row; T maps a join value to the row ids of TS
// holding that value. For each matching (TB row, TS row) combination the key pair
// (key of this table, key of Table) is looked up in Counters: a new pair is stored
// with the two row indices that produced it and count 1, a known pair only has its
// count (Val3) incremented. String columns are handled through their integer
// mappings (GetStrMapById / StrColMaps).
// NOTE(review): this listing skips source line 2486, which holds the start of the
// signature (function name and leading parameters; TB and TS are used below).
2487  const TIntIntVH& T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS,
2488  THash<TIntPr,TIntTr>& Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType){
2489  // iterate over big table and count / record joint tuples
2490  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2491  // value to join on from big table
2492  TInt JVal = 0;
2493  if(JoinColType == atStr){
2494  JVal = RowI.GetStrMapById(JoinColIdxB);
2495  } else{
2496  JVal = RowI.GetIntAttr(JoinColIdxB);
2497  }
2498  //printf("JVal: %d\n", JVal.Val);
2499  if(T.IsKey(JVal)){
2500  // read key attribute of big table row
2501  TInt KeyB = 0;
2502  if(KeyType == atStr){
2503  KeyB = RowI.GetStrMapById(KeyColIdxB);
2504  } else{
2505  KeyB = RowI.GetIntAttr(KeyColIdxB);
2506  }
2507  // read row ids from small table with join attribute value of JVal
2508  const TIntV& RelevantRows = T.GetDat(JVal);
2509  for(int i = 0; i < RelevantRows.Len(); i++){
2510  // read key attribute of relevant row from small table
2511  TInt KeyS = 0;
2512  if(KeyType == atStr){
2513  KeyS = TS.StrColMaps[KeyColIdxS][RelevantRows[i]];
2514  } else{
2515  KeyS = TS.IntCols[KeyColIdxS][RelevantRows[i]];
2516  }
2517  // create a pair of keys - serves as a key in Counters
  // The key of this table always comes first, whichever table was hashed.
2518  TIntPr Keys = ThisIsSmaller ? TIntPr(KeyS, KeyB) : TIntPr(KeyB, KeyS);
2519  if(Counters.IsKey(Keys)){
2520  // if the key pair has been seen before - increment its counter by 1
2521  TIntTr& V = Counters.GetDat(Keys);
2522  V.Val3 = V.Val3 + 1;
2523  } else{
2524  // if the key pair hasn't been seen before - add it with value of
2525  // row indices that create a joint record with this key pair
2526  if(ThisIsSmaller){
2527  Counters.AddDat(Keys, TIntTr(RelevantRows[i], RowI.GetRowIdx(),1));
2528  } else{
2529  Counters.AddDat(Keys, TIntTr(RowI.GetRowIdx(), RelevantRows[i],1));
2530  }
2531  }
2532  } // end of for loop
2533  } // end of if statement
2534  } // end of for loop
2535 }
2536 
// Variant of the collision count that keeps a separate counter per join value.
// TB is scanned row by row; T maps a join value to the row ids of TS holding that
// value. The Counters key is the triple (key of this table, key of Table, join
// value), so the same key pair is counted independently for every join value it
// collides on. The stored value is (row index in this table, row index in Table,
// count), recorded from the first combination seen for that triple.
// NOTE(review): this listing skips source line 2537, which holds the start of the
// signature (function name and leading parameters; TB and TS are used below).
2538  const TIntIntVH& T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS,
2539  THash<TIntTr,TIntTr>& Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType){
2540  for (TRowIterator RowI = TB.BegRI(); RowI < TB.EndRI(); RowI++) {
2541  // value to join on from big table
2542  TInt JVal = 0;
2543  if(JoinColType == atStr){
2544  JVal = RowI.GetStrMapById(JoinColIdxB);
2545  } else{
2546  JVal = RowI.GetIntAttr(JoinColIdxB);
2547  }
2548  //printf("JVal: %d\n", JVal.Val);
2549  if(T.IsKey(JVal)){
2550  // read key attribute of big table row
2551  TInt KeyB = 0;
2552  if(KeyType == atStr){
2553  KeyB = RowI.GetStrMapById(KeyColIdxB);
2554  } else{
2555  KeyB = RowI.GetIntAttr(KeyColIdxB);
2556  }
2557  // read row ids from small table with join attribute value of JVal
2558  const TIntV& RelevantRows = T.GetDat(JVal);
2559  for(int i = 0; i < RelevantRows.Len(); i++){
2560  // read key attribute of relevant row from small table
2561  TInt KeyS = 0;
2562  if(KeyType == atStr){
2563  KeyS = TS.StrColMaps[KeyColIdxS][RelevantRows[i]];
2564  } else{
2565  KeyS = TS.IntCols[KeyColIdxS][RelevantRows[i]];
2566  }
2567  // create a pair of keys - serves as a key in Counters
2568  TIntPr Keys = ThisIsSmaller ? TIntPr(KeyS, KeyB) : TIntPr(KeyB, KeyS);
  // Extend the key pair with the join value to get a per-join-key counter.
2569  TIntTr K(Keys.Val1,Keys.Val2,JVal);
2570  if(Counters.IsKey(K)){
2571  // if the key triple has been seen before - increment its counter by 1
2572  TIntTr& V = Counters.GetDat(K);
2573  V.Val3 = V.Val3 + 1;
2574  } else{
2575  // if the key triple hasn't been seen before - add it with value of
2576  // row indices that create a joint record with this key triple
2577  if(ThisIsSmaller){
2578  Counters.AddDat(K, TIntTr(RelevantRows[i], RowI.GetRowIdx(),1));
2579  } else{
2580  Counters.AddDat(K, TIntTr(RowI.GetRowIdx(), RelevantRows[i],1));
2581  }
2582  }
2583  } // end of for loop
2584  } // end of if statement
2585  } // end of for loop
2586  }
2587 
2588 PTable TTable::ThresholdJoinOutputTable(const THash<TIntPr,TIntTr>& Counters, TInt Threshold, const TTable& Table){
2589  // initialize result table
2590  PTable JointTable = InitializeJointTable(Table);
2591  for(THash<TIntPr,TIntTr>::TIter iter = Counters.BegI(); iter < Counters.EndI(); iter++){
2592  TIntTr& Counter = iter.GetDat();
2593  //printf("keys: %d, %d\n", iter.GetKey().Val1.Val, iter.GetKey().Val2.Val);
2594  //printf("selected rows: %d,%d, counter: %d\n", Counter.Val1.Val, Counter.Val2.Val, Counter.Val3.Val);
2595  if(Counter.Val3 >= Threshold){
2596  JointTable->AddJointRow(*this, Table, Counter.Val1, Counter.Val2);
2597  }
2598  }
2599  return JointTable;
2600 }
2601 
2603  PTable JointTable = InitializeJointTable(Table);
2604  for(THash<TIntTr,TIntTr>::TIter iter = Counters.BegI(); iter < Counters.EndI(); iter++){
2605  const TIntTr& Counter = iter.GetDat();
2606  const TIntTr& Keys = iter.GetKey();
2607  THashSet<TIntPr> Pairs;
2608  if(Counter.Val3 >= Threshold){
2609  TIntPr K(Keys.Val1,Keys.Val2);
2610  if(!Pairs.IsKey(K)){
2611  Pairs.AddKey(K);
2612  JointTable->AddJointRow(*this, Table, Counter.Val1, Counter.Val2);
2613  }
2614  }
2615  }
2616  return JointTable;
2617 }
2618 
2619 
2620 // expected output: one joint tuple (R1,R2) with:
2621 // (1) R1[KeyCol1] = K1 and R2[KeyCol2] = K2
2622 // for every pair of keys (K1,K2) such that the number of joint tuples
2623 // (joint on R1[JoinCol1] = R2[JointCol2]) that hold property (1) is at least Threshold
2624 PTable TTable::ThresholdJoin(const TStr& KeyCol1, const TStr& JoinCol1, const TTable& Table,
2625  const TStr& KeyCol2, const TStr& JoinCol2, TInt Threshold, TBool PerJoinKey){
2626  // test input correctness
2627  ThresholdJoinInputCorrectness(KeyCol1, JoinCol1, Table, KeyCol2, JoinCol2);
2628  //printf("verified input correctness\n");
2629  // type of column on which we join (currently support only int)
2630  TAttrType JoinColType = GetColType(JoinCol1);
2631  // type of key column (currently support only int)
2632  TAttrType KeyType = GetColType(KeyCol1);
2633  // Determine which table is smaller
2634  TBool ThisIsSmaller = (NumValidRows <= Table.NumValidRows);
2635  const TTable& TS = ThisIsSmaller ? *this : Table;
2636  const TTable& TB = ThisIsSmaller ? Table : *this;
2637  TStr JoinColS = JoinCol1;
2638  TInt JoinColIdxB = GetColIdx(JoinCol2);
2639  TInt KeyColIdxS = GetColIdx(KeyCol1);
2640  TInt KeyColIdxB = GetColIdx(KeyCol2);
2641  if(!ThisIsSmaller){
2642  JoinColS = JoinCol2;
2643  JoinColIdxB = GetColIdx(JoinCol1);
2644  KeyColIdxS = GetColIdx(KeyCol2);
2645  KeyColIdxB = GetColIdx(KeyCol1);
2646  }
2647 
2648  // debug print
2649  //printf("JoinColS = %d, JoinColIdxB = %d, KeyColIdxS = %d, KeyColIdxB = %d\n",
2650  //GetColIdx(JoinColS).Val, JoinColIdxB.Val, KeyColIdxS.Val, KeyColIdxB.Val);
2651  //printf("starting switch-case\n");
2652 
2653  if(KeyType != atInt && KeyType != atStr){
2654  printf("ThresholdJoin only supports integer or string key attributes\n");
2655  TExcept::Throw("ThresholdJoin only supports integer or string key attributes");
2656  }
2657  if(JoinColType != atInt && JoinColType != atStr){
2658  printf("ThresholdJoin only supports integer or string join attributes\n");
2659  TExcept::Throw("ThresholdJoin only supports integer or string join attributes");
2660  }
2661  //printf("starting the real stuff!\n");
2662  // hash the smaller table T: join col value --> physical row ids of rows with that value
2663  TIntIntVH T;
2664  if(JoinColType == atInt){
2665  TS.GroupByIntCol(JoinColS, T, TIntV(), true);
2666  } else if(JoinColType == atStr){
2667  TS.GroupByStrCol(JoinColS, T, TIntV(), true);
2668  } else{
2669  TExcept::Throw("ThresholdJoin only supports integer or string join attributes");
2670  }
2671 
2672  /*
2673  for(THash<TInt,TIntV>::TIter it = T.BegI(); it < T.EndI(); it++){
2674  if(JoinColType == atStr){
2675  printf("%s -->", Context.StringVals.GetKey(it.GetKey().Val));
2676  } else{
2677  printf("%d -->", it.GetKey().Val);
2678  }
2679  const TIntV& V = it.GetDat();
2680  for(int sr = 0; sr < V.Len(); sr++){
2681  printf(" %d", V[sr].Val);
2682  }
2683  printf("\n");
2684  }
2685  */
2686 
2687  // Counters: (K1,K2) --> (RowIdx1,RowIdx2, count) where K1 is a key from KeyCol1,
2688  // K2 is a key from Table's KeyCol2; RowIdx1 and RowIdx2 are physical row ids
2689  // that participates in a joint tuple that satisfies (1).
2690  // count is the count of joint records that satisfy (1).
2691  // In case of string attributes - the integer mappings of the key attribute values are used.
2692  if(PerJoinKey){
2693  //printf("PerJoinKey\n");
2694  THash<TIntTr,TIntTr> Counters;
2695  ThresholdJoinCountPerJoinKeyCollisions(TB, TS, T, JoinColIdxB, KeyColIdxB, KeyColIdxS, Counters, ThisIsSmaller, JoinColType, KeyType);
2696  /*
2697  for(THash<TIntTr,TIntTr>::TIter it = Counters.BegI(); it < Counters.EndI(); it++){
2698  const TIntTr& K = it.GetKey();
2699  const TIntTr& V = it.GetDat();
2700  if(KeyType == atStr){
2701  printf("%s %s --> %d %d %d\n", Context->StringVals.GetKey(K.Val1), Context->StringVals.GetKey(K.Val2), V.Val1.Val, V.Val2.Val, V.Val3.Val);
2702  } else{
2703  printf("%d %d --> %d %d %d\n", K.Val1.Val, K.Val2.Val, V.Val1.Val, V.Val2.Val, V.Val3.Val);
2704  }
2705  }
2706  */
2707  //printf("found collisions\n");
2708  return ThresholdJoinPerJoinKeyOutputTable(Counters, Threshold, Table);
2709  } else{
2710  //printf("not PerJoinKey\n");
2711  THash<TIntPr,TIntTr> Counters;
2712  ThresholdJoinCountCollisions(TB, TS, T, JoinColIdxB, KeyColIdxB, KeyColIdxS, Counters, ThisIsSmaller, JoinColType, KeyType);
2713  /*
2714  for(THash<TIntPr,TIntTr>::TIter it = Counters.BegI(); it < Counters.EndI(); it++){
2715  const TIntPr& K = it.GetKey();
2716  const TIntTr& V = it.GetDat();
2717  if(KeyType == atStr){
2718  printf("%s %s --> %d %d %d\n", Context->StringVals.GetKey(K.Val1), Context->StringVals.GetKey(K.Val2), V.Val1.Val, V.Val2.Val, V.Val3.Val);
2719  } else{
2720  printf("%d %d --> %d %d %d\n", K.Val1.Val, K.Val2.Val, V.Val1.Val, V.Val2.Val, V.Val3.Val);
2721  }
2722  }
2723  */
2724  //printf("found collisions\n");
2725  return ThresholdJoinOutputTable(Counters, Threshold, Table);
2726  }
2727 }
2728 
2729 
// Evaluates Predicate on every row of the table.
// The variables of the predicate are taken as column names; before each evaluation
// the predicate is fed the current row's value for each of those columns.
//   Remove == true : rows for which the predicate is false are removed from the
//                    table through RowI.RemoveNext(); SelectedRows is not written.
//   Remove == false: the table is left as is and the row index of every row for
//                    which the predicate is true is appended to SelectedRows.
// NOTE(review): the local Selected is never used in the visible lines.
// NOTE(review): this listing skips source line 2743, where RowI of the Remove
// branch is declared; that declaration is not visible here.
2730 void TTable::Select(TPredicate& Predicate, TIntV& SelectedRows, TBool Remove) {
2731  TIntV Selected;
2732  TStrV RelevantCols;
2733  Predicate.GetVariables(RelevantCols);
2734  TInt NumRelevantCols = RelevantCols.Len();
  // Resolve type and index of every referenced column once, outside the row loops.
2735  TVec<TAttrType> ColTypes = TVec<TAttrType>(NumRelevantCols);
2736  TIntV ColIndices = TIntV(NumRelevantCols);
2737  for (TInt i = 0; i < NumRelevantCols; i++) {
2738  ColTypes[i] = GetColType(RelevantCols[i]);
2739  ColIndices[i] = GetColIdx(RelevantCols[i]);
2740  }
2741 
2742  if (Remove) {
  // The iterator here works on the row following its position (GetNext* accessors),
  // so that row can be removed without losing the iterator's place.
2744  while (RowI.GetNextRowIdx() != Last) {
2745  // prepare arguments for predicate evaluation
2746  for (TInt i = 0; i < NumRelevantCols; i++) {
2747  switch (ColTypes[i]) {
2748  case atInt:
2749  Predicate.SetIntVal(RelevantCols[i], RowI.GetNextIntAttr(ColIndices[i]));
2750  break;
2751  case atFlt:
2752  Predicate.SetFltVal(RelevantCols[i], RowI.GetNextFltAttr(ColIndices[i]));
2753  break;
2754  case atStr:
2755  Predicate.SetStrVal(RelevantCols[i], RowI.GetNextStrAttr(ColIndices[i]));
2756  break;
2757  }
2758  }
  // Advance only when the row is kept; after RemoveNext the iterator is not advanced.
2759  if (!Predicate.Eval()) {
2760  RowI.RemoveNext();
2761  } else {
2762  RowI++;
2763  }
2764  }
2765  } else {
  // Read-only pass: attributes are fetched by column name rather than by index.
2766  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
2767  for (TInt i = 0; i < NumRelevantCols; i++) {
2768  switch (ColTypes[i]) {
2769  case atInt:
2770  Predicate.SetIntVal(RelevantCols[i], RowI.GetIntAttr(RelevantCols[i]));
2771  break;
2772  case atFlt:
2773  Predicate.SetFltVal(RelevantCols[i], RowI.GetFltAttr(RelevantCols[i]));
2774  break;
2775  case atStr:
2776  Predicate.SetStrVal(RelevantCols[i], RowI.GetStrAttr(RelevantCols[i]));
2777  break;
2778  }
2779  }
2780  if (Predicate.Eval()) { SelectedRows.Add(RowI.GetRowIdx()); }
2781  }
2782  }
2783 }
2784 
2785 void TTable::Classify(TPredicate& Predicate, const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
2786  TIntV SelectedRows;
2787  Select(Predicate, SelectedRows, false);
2788  ClassifyAux(SelectedRows, LabelName, PositiveLabel, NegativeLabel);
2789 }
2790 
2791 
2792 // Further optimization: both comparison operation and type of columns don't change between rows..
//
// Compares two columns of the same row with the comparison Cmp, for every row.
//   Remove == true : rows for which the comparison is false are removed through
//                    RowI.RemoveNext(); SelectedRows is not written.
//   Remove == false: the row index of every row for which the comparison is true
//                    is appended to SelectedRows.
// Throws when the two columns have different types; SUBSTR / SUPERSTR are asserted
// to be used on string columns only.
// NOTE(review): this listing skips source line 2804, where RowI of the Remove
// branch is declared; that declaration is not visible here.
2793 void TTable::SelectAtomic(const TStr& Col1, const TStr& Col2, TPredComp Cmp, TIntV& SelectedRows, TBool Remove) {
2794  const TAttrType Ty1 = GetColType(Col1);
2795  const TAttrType Ty2 = GetColType(Col2);
2796  const TInt ColIdx1 = GetColIdx(Col1);
2797  const TInt ColIdx2 = GetColIdx(Col2);
2798  if (Ty1 != Ty2) {
2799  TExcept::Throw("SelectAtomic: diff types");
2800  }
2801  if (Cmp == SUBSTR || Cmp == SUPERSTR) { Assert(Ty1 == atStr); }
2802 
2803  if (Remove) {
  // The iterator here inspects the row following its position (GetNext* accessors).
2805  while (RowI.GetNextRowIdx() != Last) {
2806 
2807  TBool Result;
2808  switch (Ty1) {
2809  case atInt:
2810  Result = TPredicate::EvalAtom(RowI.GetNextIntAttr(ColIdx1), RowI.GetNextIntAttr(ColIdx2), Cmp);
2811  break;
2812  case atFlt:
2813  Result = TPredicate::EvalAtom(RowI.GetNextFltAttr(ColIdx1), RowI.GetNextFltAttr(ColIdx2), Cmp);
2814  break;
2815  case atStr:
2816  Result = TPredicate::EvalStrAtom(RowI.GetNextStrAttr(ColIdx1), RowI.GetNextStrAttr(ColIdx2), Cmp);
2817  break;
2818  }
2819 
  // Advance only when the row is kept; after RemoveNext the iterator is not advanced.
2820  if (!Result) {
2821  RowI.RemoveNext();
2822  } else {
2823  RowI++;
2824  }
2825 
2826  }
2827  } else {
  // Read-only pass: attributes are fetched by column name rather than by index.
2828  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
2829  TBool Result;
2830  switch (Ty1) {
2831  case atInt:
2832  Result = TPredicate::EvalAtom(RowI.GetIntAttr(Col1), RowI.GetIntAttr(Col2), Cmp);
2833  break;
2834  case atFlt:
2835  Result = TPredicate::EvalAtom(RowI.GetFltAttr(Col1), RowI.GetFltAttr(Col2), Cmp);
2836  break;
2837  case atStr:
2838  Result = TPredicate::EvalStrAtom(RowI.GetStrAttr(Col1), RowI.GetStrAttr(Col2), Cmp);
2839  break;
2840  }
2841  if (Result) { SelectedRows.Add(RowI.GetRowIdx()); }
2842  }
2843  }
2844 }
2845 
2846 void TTable::ClassifyAtomic(const TStr& Col1, const TStr& Col2, TPredComp Cmp,
2847  const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
2848  TIntV SelectedRows;
2849  SelectAtomic(Col1, Col2, Cmp, SelectedRows, false);
2850  ClassifyAux(SelectedRows, LabelName, PositiveLabel, NegativeLabel);
2851 }
2852 
2854  TIntV& SelectedRows, PTable& SelectedTable, TBool Remove, TBool Table) {
2855  //double startFn = omp_get_wtime();
2856  TStr ValTStr(Val.GetStr());
2857  TAttrType Type = GetColType(Col);
2858  TInt ColIdx = GetColIdx(Col);
2859 
2860  if (Type != Val.GetType()) {
2861  TExcept::Throw("SelectAtomicConst: coltype does not match const type");
2862  }
2863 
2864  if(Remove){
2865 #ifdef USE_OPENMP
2866  if (GetMP()) {
2867  //double endInit = omp_get_wtime();
2868  //printf("Init time = %f\n", endInit-startFn);
2869  TIntPrV Partitions;
2870  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2871  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2872  int RemoveCount = 0;
2873  //double endPart = omp_get_wtime();
2874  //printf("Partition time = %f\n", endPart-endInit);
2875 
2876  TIntPrV Bounds(Partitions.Len());
2877 
2878  // #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:RemoveCount) shared(Val)
2879  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:RemoveCount)
2880  for (int i = 0; i < Partitions.Len(); i++){
2881  //TPrimitive ThreadLocalVal(Val);
2882  TRowIterator RowI(Partitions[i].GetVal1(), this);
2883  TRowIterator EndI(Partitions[i].GetVal2(), this);
2884  TInt FirstRowIdx = TTable::Invalid;
2885  TInt LastRowIdx = TTable::Invalid;
2886  TBool First = true;
2887  while (RowI < EndI) {
2888  TInt CurrRowIdx = RowI.GetRowIdx();
2889  TBool Result;
2890  if (Type != atStr) {
2891  Result = RowI.CompareAtomicConst(ColIdx, Val, Cmp);
2892  } else {
2893  Result = RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp);
2894  }
2895  RowI++;
2896  if(!Result) {
2897  Next[CurrRowIdx] = TTable::Invalid;
2898  RemoveCount++;
2899  } else {
2900  if (First) { FirstRowIdx = CurrRowIdx; First = false; }
2901  else { Next[LastRowIdx] = CurrRowIdx; }
2902  LastRowIdx = CurrRowIdx;
2903  }
2904  }
2905  Bounds[i] = TIntPr(FirstRowIdx, LastRowIdx);
2906  //printf("Thread %d: i = %d, start = %d, end = %d\n", omp_get_thread_num(), i,
2907  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val);
2908  }
2909  //double endIter = omp_get_wtime();
2910  //printf("Iter time = %f\n", endIter-endPart);
2911 
2912  // repair the next vector
2913  TInt CurrBound = 0;
2914  while (CurrBound < Bounds.Len() && Bounds[CurrBound].Val1 == TTable::Invalid) {
2915  CurrBound++;
2916  }
2917  if (CurrBound == Bounds.Len()) {
2918  // selected table is empty
2919  Assert(NumValidRows == RemoveCount);
2920  NumValidRows = 0;
2923  } else {
2924  NumValidRows -= RemoveCount;
2925  FirstValidRow = Bounds[CurrBound].Val1;
2926  LastValidRow = Bounds[CurrBound].Val2;
2927  TInt PrevBound = CurrBound;
2928  CurrBound++;
2929  while (CurrBound < Bounds.Len()) {
2930  if (Bounds[CurrBound].Val1 == TTable::Invalid) { CurrBound++; continue; }
2931  Next[Bounds[PrevBound].Val2] = Bounds[CurrBound].Val1;
2932  LastValidRow = Bounds[CurrBound].Val2;
2933  PrevBound = CurrBound;
2934  CurrBound++;
2935  }
2936  Next[Bounds[PrevBound].Val2] = TTable::Last;
2937  }
2938  IsNextDirty = 1;
2939  //double endRepair = omp_get_wtime();
2940  //printf("Repair time = %f\n", endRepair-endIter);
2941  } else {
2942 #endif
2944  while(RowI.GetNextRowIdx() != Last){
2945  if (!RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
2946  RowI.RemoveNext();
2947  } else {
2948  RowI++;
2949  }
2950  }
2951  IsNextDirty = 1;
2952 #ifdef USE_OPENMP
2953  }
2954 #endif
2955  } else if (Table) {
2956 #ifdef USE_OPENMP
2957  if (GetMP()) {
2958  //double endInit = omp_get_wtime();
2959  //printf("Init time = %f\n", endInit-startFn);
2960  TIntPrV Partitions;
2961  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
2962  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
2963  //double endPart = omp_get_wtime();
2964  //printf("Partition time = %f\n", endPart-endInit);
2965 
2966  int TotalSelectedRows = 0;
2967  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) reduction(+:TotalSelectedRows)
2968  for (int i = 0; i < Partitions.Len(); i++){
2969  TRowIterator RowI(Partitions[i].GetVal1(), this);
2970  TRowIterator EndI(Partitions[i].GetVal2(), this);
2971  while (RowI < EndI) {
2972  if (Type != atStr) {
2973  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
2974  TotalSelectedRows++;
2975  }
2976  } else {
2977  if (RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp)) {
2978  TotalSelectedRows++;
2979  }
2980  }
2981  RowI++;
2982  }
2983  }
2984  //double endCount = omp_get_wtime();
2985  //printf("Count time = %f\n", endCount-endPart);
2986 
2987  SelectedTable->ResizeTable(TotalSelectedRows);
2988  //double endResize = omp_get_wtime();
2989  //printf("Resize time = %f\n", endResize-endCount);
2990 
2991  if (TotalSelectedRows == 0) {
2992  // printf("Select: Empty output!\n");
2993  return;
2994  }
2995 
2996  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
2997  for (int i = 0; i < Partitions.Len(); i++){
2998  TIntV LocalSelectedRows;
2999  LocalSelectedRows.Reserve(PartitionSize);
3000  TRowIterator RowI(Partitions[i].GetVal1(), this);
3001  TRowIterator EndI(Partitions[i].GetVal2(), this);
3002  while (RowI < EndI) {
3003  if (Type != atStr) {
3004  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3005  LocalSelectedRows.Add(RowI.GetRowIdx());
3006  }
3007  } else {
3008  if (RowI.CompareAtomicConstTStr(ColIdx, ValTStr, Cmp)) {
3009  LocalSelectedRows.Add(RowI.GetRowIdx());
3010  }
3011  }
3012  RowI++;
3013  }
3014  SelectedTable->AddSelectedRows(*this, LocalSelectedRows);
3015  //printf("Thread %d: i = %d, start = %d, end = %d\n", omp_get_thread_num(), i,
3016  // Partitions[i].GetVal1().Val, Partitions[i].GetVal2().Val);
3017  }
3018  //double endIter = omp_get_wtime();
3019  //printf("Iter time = %f\n", endIter-endResize);
3020 
3021  //SelectedTable->ResizeTable(SelectedTable->GetNumValidRows());
3022  //double endResize2 = omp_get_wtime();
3023  //printf("Resize2 time = %f\n", endResize2-endIter);
3024  SelectedTable->SetFirstValidRow();
3025  } else {
3026 #endif
3027  for(TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++){
3028  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3029  SelectedTable->AddRow(RowI);
3030  }
3031  }
3032 #ifdef USE_OPENMP
3033  }
3034 #endif
3035  } else {
3036  for(TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++){
3037  if (RowI.CompareAtomicConst(ColIdx, Val, Cmp)) {
3038  SelectedRows.Add(RowI.GetRowIdx());
3039  }
3040  }
3041  }
3042 }
3043 
3044 inline TInt TTable::CompareRows(TInt R1, TInt R2, const TAttrType& CompareByType, const TInt& CompareByIndex, TBool Asc) {
3045  //printf("comparing rows %d %d by %s\n", R1.Val, R2.Val, CompareBy.CStr());
3046  switch (CompareByType) {
3047  case atInt:{
3048  if (IntCols[CompareByIndex][R1] > IntCols[CompareByIndex][R2]) { return (Asc ? 1 : -1); }
3049  if (IntCols[CompareByIndex][R1] < IntCols[CompareByIndex][R2]) { return (Asc ? -1 : 1); }
3050  return 0;
3051  }
3052  case atFlt:{
3053  if (FltCols[CompareByIndex][R1] > FltCols[CompareByIndex][R2]) { return (Asc ? 1 : -1); }
3054  if (FltCols[CompareByIndex][R1] < FltCols[CompareByIndex][R2]) { return (Asc ? -1 : 1); }
3055  return 0;
3056  }
3057  case atStr:{
3058  TStr S1 = GetStrVal(CompareByIndex, R1);
3059  TStr S2 = GetStrVal(CompareByIndex, R2);
3060  int CmpRes = strcmp(S1.CStr(), S2.CStr());
3061  return (Asc ? CmpRes : -CmpRes);
3062  }
3063  }
3064  // code should not come here, added to remove a compiler warning
3065  return 0;
3066 }
3067 
3068 inline TInt TTable::CompareRows(TInt R1, TInt R2, const TVec<TAttrType>& CompareByTypes, const TIntV& CompareByIndices, TBool Asc) {
3069  for (TInt i = 0; i < CompareByTypes.Len(); i++) {
3070  TInt res = CompareRows(R1, R2, CompareByTypes[i], CompareByIndices[i], Asc);
3071  if (res != 0) { return res; }
3072  }
3073  return 0;
3074 }
3075 
3076 void TTable::ISort(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3077  if (StartIdx < EndIdx) {
3078  for (TInt i = StartIdx+1; i <= EndIdx; i++) {
3079  TInt Val = V[i];
3080  TInt j = i;
3081  while ((StartIdx < j) && (CompareRows(V[j-1], Val, SortByTypes, SortByIndices, Asc) > 0)) {
3082  V[j] = V[j-1];
3083  j--;
3084  }
3085  V[j] = Val;
3086  }
3087  }
3088 }
3089 
3090 TInt TTable::GetPivot(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3091  TInt L = EndIdx - StartIdx + 1;
3092  const TInt Idx1 = StartIdx + TInt::GetRnd(L);
3093  const TInt Idx2 = StartIdx + TInt::GetRnd(L);
3094  const TInt Idx3 = StartIdx + TInt::GetRnd(L);
3095  if (CompareRows(V[Idx1], V[Idx2], SortByTypes, SortByIndices, Asc) < 0) {
3096  if (CompareRows(V[Idx2], V[Idx3], SortByTypes, SortByIndices, Asc) < 0) { return Idx2; }
3097  if (CompareRows(V[Idx1], V[Idx3], SortByTypes, SortByIndices, Asc) < 0) { return Idx3; }
3098  return Idx1;
3099  } else {
3100  if (CompareRows(V[Idx3], V[Idx2], SortByTypes, SortByIndices, Asc) < 0) { return Idx2; }
3101  if (CompareRows(V[Idx3], V[Idx1], SortByTypes, SortByIndices, Asc) < 0) { return Idx3; }
3102  return Idx1;
3103  }
3104 }
3105 
3106 TInt TTable::Partition(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3107 
3108  // test if the elements are already sorted
3109  TInt j;
3110  for (j = StartIdx; j < EndIdx; j++) {
3111  if (CompareRows(V[j], V[j+1], SortByTypes, SortByIndices, Asc) > 0) {
3112  break;
3113  }
3114  }
3115  if (j >= EndIdx) {
3116  return EndIdx+1;
3117  }
3118 
3119  TInt PivotIdx = GetPivot(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3120  TInt Pivot = V[PivotIdx];
3121  V.Swap(PivotIdx, EndIdx);
3122  TInt StoreIdx = StartIdx;
3123  for (TInt i = StartIdx; i < EndIdx; i++) {
3124  if (CompareRows(V[i], Pivot, SortByTypes, SortByIndices, Asc) <= 0) {
3125  V.Swap(i, StoreIdx);
3126  StoreIdx++;
3127  }
3128  }
3129  // move pivot value to its place
3130  V.Swap(StoreIdx, EndIdx);
3131  return StoreIdx;
3132 }
3133 
3134 void TTable::QSort(TIntV& V, TInt StartIdx, TInt EndIdx, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3135  if (StartIdx < EndIdx) {
3136  if (EndIdx - StartIdx < 20) {
3137  ISort(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3138  } else {
3139  TInt Pivot = Partition(V, StartIdx, EndIdx, SortByTypes, SortByIndices, Asc);
3140  if (Pivot > EndIdx) {
3141  return;
3142  }
3143  // Everything <= Pivot will be in StartIdx, Pivot-1. Shrink this
3144  // range to ignore elements equal to the pivot in the first
3145  // recursive call, to optimize for the case when a lot of
3146  // rows are equal.
3147  int Ub = Pivot - 1;
3148  while (Ub >= StartIdx && CompareRows(
3149  V[Ub], V[Pivot], SortByTypes, SortByIndices, Asc) == 0) {
3150  Ub -= 1;
3151  }
3152  QSort(V, StartIdx, Ub, SortByTypes, SortByIndices, Asc);
3153  QSort(V, Pivot+1, EndIdx, SortByTypes, SortByIndices, Asc);
3154  }
3155  }
3156 }
3157 
3158 void TTable::Merge(TIntV& V, TInt Idx1, TInt Idx2, TInt Idx3, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3159  TInt i = Idx1, j = Idx2;
3160  TIntV SortedV;
3161  while (i < Idx2 && j < Idx3) {
3162  if (CompareRows(V[i], V[j], SortByTypes, SortByIndices, Asc) <= 0) {
3163  SortedV.Add(V[i]);
3164  i++;
3165  }
3166  else {
3167  SortedV.Add(V[j]);
3168  j++;
3169  }
3170  }
3171  while (i < Idx2) {
3172  SortedV.Add(V[i]);
3173  i++;
3174  }
3175  while (j < Idx3) {
3176  SortedV.Add(V[j]);
3177  j++;
3178  }
3179 
3180  for (TInt sz = 0; sz < Idx3 - Idx1; sz++) {
3181  V[Idx1 + sz] = SortedV[sz];
3182  }
3183 }
3184 
3185 #ifdef USE_OPENMP
3186 void TTable::QSortPar(TIntV& V, const TVec<TAttrType>& SortByTypes, const TIntV& SortByIndices, TBool Asc) {
3187  TInt NumThreads = 8; // Setting this to 8 because that results in the fastest sorting on Madmax.
3188  TInt Sz = V.Len();
3189  TIntV IndV, NextV;
3190  for (TInt i = 0; i < NumThreads; i++) {
3191  IndV.Add(i * (Sz / NumThreads));
3192  }
3193  IndV.Add(Sz);
3194 
3195  omp_set_num_threads(NumThreads);
3196  #pragma omp parallel for
3197  for (int i = 0; i < NumThreads; i++) {
3198  QSort(V, IndV[i], IndV[i+1] - 1, SortByTypes, SortByIndices, Asc);
3199  }
3200 
3201  while (NumThreads > 1) {
3202  omp_set_num_threads(NumThreads / 2);
3203  #pragma omp parallel for
3204  for (int i = 0; i < NumThreads; i += 2) {
3205  Merge(V, IndV[i], IndV[i+1], IndV[i+2], SortByTypes, SortByIndices, Asc);
3206  }
3207 
3208  NextV.Clr();
3209  for (TInt i = 0; i < NumThreads; i+=2) {
3210  NextV.Add(IndV[i]);
3211  }
3212  NextV.Add(Sz);
3213  IndV = NextV;
3214 
3215  NumThreads = NumThreads / 2;
3216  }
3217 }
3218 #endif // USE_OPENMP
3219 
3220 void TTable::Order(const TStrV& OrderBy, TStr OrderColName, TBool ResetRankByMSC, TBool Asc) {
3221  // get a vector of all valid row indices
3222  TIntV ValidRows = TIntV(NumValidRows);
3223  if (NumRows == NumValidRows) {
3224  for (TInt i = 0; i < NumValidRows; i++) {
3225  ValidRows[i] = i;
3226  }
3227  } else {
3228  TInt i = 0;
3229  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
3230  ValidRows[i] = RI.GetRowIdx();
3231  i++;
3232  }
3233  }
3234  TVec<TAttrType> OrderByTypes(OrderBy.Len());
3235  TIntV OrderByIndices(OrderBy.Len());
3236  for (TInt i = 0; i < OrderBy.Len(); i++) {
3237  OrderByTypes[i] = GetColType(OrderBy[i]);
3238  OrderByIndices[i] = GetColIdx(OrderBy[i]);
3239  }
3240 
3241  // sort that vector according to the attributes given in "OrderBy" in lexicographic order
3242 #ifdef USE_OPENMP
3243  if (GetMP()) {
3244  QSortPar(ValidRows, OrderByTypes, OrderByIndices, Asc);
3245  } else {
3246 #endif
3247  QSort(ValidRows, 0, NumValidRows-1, OrderByTypes, OrderByIndices, Asc);
3248 #ifdef USE_OPENMP
3249  }
3250 #endif
3251 
3252  // rewire Next vector
3253  IsNextDirty = 1;
3254  if (NumValidRows > 0) {
3255  FirstValidRow = ValidRows[0];
3256  } else {
3257  FirstValidRow = Last;
3258  }
3259  for (TInt i = 0; i < NumValidRows-1; i++) {
3260  Next[ValidRows[i]] = ValidRows[i+1];
3261  }
3262  if (NumValidRows > 0) {
3263  Next[ValidRows[NumValidRows-1]] = Last;
3264  LastValidRow = ValidRows[NumValidRows-1];
3265  } else {
3266  LastValidRow = Last;
3267  }
3268 
3269  // add rank column
3270  if (!OrderColName.Empty()) {
3271  TIntV RankCol = TIntV(NumRows);
3272  for (TInt i = 0; i < NumValidRows; i++) {
3273  RankCol[ValidRows[i]] = i;
3274  }
3275  if (ResetRankByMSC) {
3276  for (TInt i = 1; i < NumValidRows; i++) {
3277  TStr GroupName = OrderBy[0];
3278  if (GetStrVal(GroupName, ValidRows[i]) != GetStrVal(GroupName, ValidRows[i-1])) {
3279  RankCol[ValidRows[i]] = 0;
3280  } else {
3281  RankCol[ValidRows[i]] = RankCol[ValidRows[i-1]] + 1;
3282  }
3283  }
3284  }
3285  IntCols.Add(RankCol);
3286  AddSchemaCol(OrderColName, atInt);
3287  AddColType(OrderColName, atInt, IntCols.Len()-1);
3288  }
3289 }
3290 
3292  TInt FreeIndex = 0;
3293  TIntV Mapping; // Mapping[old_index] = new_index/invalid
3294 
3295  TInt IdColIdx = GetColIdx(IdColName);
3296 
3297  for (TInt i = 0; i < Next.Len(); i++) {
3298  if (Next[i] != TTable::Invalid) {
3299  // "first row" properly set beforehand
3300  if (FreeIndex == 0) {
3301  Assert (i == FirstValidRow);
3302  FirstValidRow = 0;
3303  }
3304 
3305  if (Next[i] != Last) {
3306  Next[FreeIndex] = FreeIndex + 1;
3307  Mapping.Add(FreeIndex);
3308  } else {
3309  Next[FreeIndex] = Last;
3310  LastValidRow = FreeIndex;
3311  Mapping.Add(Last);
3312  }
3313 
3314  RowIdMap.AddDat(IntCols[IdColIdx][i], FreeIndex);
3315 
3316  for (TInt j = 0; j < IntCols.Len(); j++) {
3317  IntCols[j][FreeIndex] = IntCols[j][i];
3318  }
3319  for (TInt j = 0; j < FltCols.Len(); j++) {
3320  FltCols[j][FreeIndex] = FltCols[j][i];
3321  }
3322  for (TInt j = 0; j < StrColMaps.Len(); j++) {
3323  StrColMaps[j][FreeIndex] = StrColMaps[j][i];
3324  }
3325 
3326  FreeIndex++;
3327  } else {
3328  NumRows--;
3329  Mapping.Add(TTable::Invalid);
3330  }
3331  }
3332 
3333  // should match, or bug somewhere
3335 }
3336 
3338  if (N == 0) {
3339  LastValidRow = -1;
3340  return;
3341  }
3342  TRowIterator RowI = BegRI();
3343  TInt count = 1;
3344  while (count < N) {
3345  if (!(RowI < EndRI())) {
3346  return; // The table contains less than N rows
3347  }
3348  RowI++;
3349  count++;
3350  }
3351  NumValidRows = N;
3352  TInt LastId = RowI.GetRowIdx();
3353  if (Next[LastId] == Last) {
3354  return; // The table contains exactly N rows
3355  }
3356  // The table contains more than N rows
3357  TInt CurrId = LastId;
3358  while (Next[CurrId] != Last) {
3359  Assert(Next[CurrId] != Invalid);
3360  TInt NextId = Next[CurrId];
3361  Next[CurrId] = Invalid;
3362  CurrId = NextId;
3363  }
3364  Next[LastId] = Last;
3365  LastValidRow = LastId;
3366 }
3367 
3368 inline void TTable::CheckAndAddIntNode(PNEANet Graph, THashSet<TInt>& NodeVals, TInt NodeId) {
3369  if (!NodeVals.IsKey(NodeId)) {
3370  Graph->AddNode(NodeId);
3371  NodeVals.AddKey(NodeId);
3372  }
3373 }
3374 
3375 inline void TTable::AddEdgeAttributes(PNEANet& Graph, int RowId) {
3376  for (TInt i = 0; i < EdgeAttrV.Len(); i++) {
3377  TStr ColName = EdgeAttrV[i];
3378  TAttrType T = GetColType(ColName);
3379  TInt Index = GetColIdx(ColName);
3380  switch (T) {
3381  case atInt:
3382  Graph->AddIntAttrDatE(RowId, IntCols[Index][RowId], ColName);
3383  break;
3384  case atFlt:
3385  Graph->AddFltAttrDatE(RowId, FltCols[Index][RowId], ColName);
3386  break;
3387  case atStr:
3388  Graph->AddStrAttrDatE(RowId, GetStrVal(Index, RowId), ColName);
3389  break;
3390  }
3391  }
3392 }
3393 
3394 inline void TTable::AddNodeAttributes(TInt NId, TStrV NodeAttrV, TInt RowId, THash<TInt, TStrIntVH>& NodeIntAttrs,
3395  THash<TInt, TStrFltVH>& NodeFltAttrs, THash<TInt, TStrStrVH>& NodeStrAttrs) {
3396  for (TInt i = 0; i < NodeAttrV.Len(); i++) {
3397  TStr ColAttr = NodeAttrV[i];
3398  TAttrType CT = GetColType(ColAttr);
3399  int ColId = GetColIdx(ColAttr);
3400  // check if this is a common src-dst attribute
3401  for (TInt i = 0; i < CommonNodeAttrs.Len(); i++) {
3402  if (CommonNodeAttrs[i].Val1 == ColAttr || CommonNodeAttrs[i].Val2 == ColAttr) {
3403  ColAttr = CommonNodeAttrs[i].Val3;
3404  break;
3405  }
3406  }
3407  if (CT == atInt) {
3408  if (!NodeIntAttrs.IsKey(NId)) { NodeIntAttrs.AddKey(NId); }
3409  if (!NodeIntAttrs.GetDat(NId).IsKey(ColAttr)) { NodeIntAttrs.GetDat(NId).AddKey(ColAttr); }
3410  NodeIntAttrs.GetDat(NId).GetDat(ColAttr).Add(IntCols[ColId][RowId]);
3411  } else if (CT == atFlt) {
3412  if (!NodeFltAttrs.IsKey(NId)) { NodeFltAttrs.AddKey(NId); }
3413  if (!NodeFltAttrs.GetDat(NId).IsKey(ColAttr)) { NodeFltAttrs.GetDat(NId).AddKey(ColAttr); }
3414  NodeFltAttrs.GetDat(NId).GetDat(ColAttr).Add(FltCols[ColId][RowId]);
3415  } else {
3416  if (!NodeStrAttrs.IsKey(NId)) { NodeStrAttrs.AddKey(NId); }
3417  if (!NodeStrAttrs.GetDat(NId).IsKey(ColAttr)) { NodeStrAttrs.GetDat(NId).AddKey(ColAttr); }
3418  NodeStrAttrs.GetDat(NId).GetDat(ColAttr).Add(GetStrVal(ColId, RowId));
3419  }
3420  }
3421 }
3422 
3423 // Makes one pass over all the rows in the vector RowIds, and builds
3424 // a PNEANet, with each row as an edge between SrcCol and DstCol.
3425 PNEANet TTable::BuildGraph(const TIntV& RowIds, TAttrAggr AggrPolicy) {
3426  PNEANet Graph = TNEANet::New();
3427 
3428  const TAttrType NodeType = GetColType(SrcCol);
3429  Assert(NodeType == GetColType(DstCol));
3430  const TInt SrcColIdx = GetColIdx(SrcCol);
3431  const TInt DstColIdx = GetColIdx(DstCol);
3432 
3433  // node values - i.e. the unique values of src/dst col
3434  //THashSet<TInt> IntNodeVals; // for both int and string node attr types.
3435  THash<TFlt, TInt> FltNodeVals;
3436 
3437  // node attributes
3438  THash<TInt, TStrIntVH> NodeIntAttrs;
3439  THash<TInt, TStrFltVH> NodeFltAttrs;
3440  THash<TInt, TStrStrVH> NodeStrAttrs;
3441 
3442  // make single pass over all rows in given row id set
3443  for (TVec<TInt>::TIter it = RowIds.BegI(); it < RowIds.EndI(); it++) {
3444  TInt CurrRowIdx = *it;
3445 
3446  // add src and dst nodes to graph if they are not seen earlier
3447  TInt SVal, DVal;
3448  if (NodeType == atFlt) {
3449  TFlt FSVal = FltCols[SrcColIdx][CurrRowIdx];
3450  SVal = CheckAndAddFltNode(Graph, FltNodeVals, FSVal);
3451  TFlt FDVal = FltCols[SrcColIdx][CurrRowIdx];
3452  DVal = CheckAndAddFltNode(Graph, FltNodeVals, FDVal);
3453  } else if (NodeType == atInt || NodeType == atStr) {
3454  if (NodeType == atInt) {
3455  SVal = IntCols[SrcColIdx][CurrRowIdx];
3456  DVal = IntCols[DstColIdx][CurrRowIdx];
3457  } else {
3458  SVal = StrColMaps[SrcColIdx][CurrRowIdx];
3459  if (strlen(Context->StringVals.GetKey(SVal)) == 0) { continue; } //illegal value
3460  DVal = StrColMaps[DstColIdx][CurrRowIdx];
3461  if (strlen(Context->StringVals.GetKey(DVal)) == 0) { continue; } //illegal value
3462  }
3463  if (!Graph->IsNode(SVal)) { Graph->AddNode(SVal); }
3464  if (!Graph->IsNode(DVal)) { Graph->AddNode(DVal); }
3465  //CheckAndAddIntNode(Graph, IntNodeVals, SVal);
3466  //CheckAndAddIntNode(Graph, IntNodeVals, DVal);
3467  }
3468 
3469  // add edge and edge attributes
3470  Graph->AddEdge(SVal, DVal, CurrRowIdx);
3471  if (EdgeAttrV.Len() > 0) { AddEdgeAttributes(Graph, CurrRowIdx); }
3472 
3473  // get src and dst node attributes into hashmaps
3474  if (SrcNodeAttrV.Len() > 0) {
3475  AddNodeAttributes(SVal, SrcNodeAttrV, CurrRowIdx, NodeIntAttrs, NodeFltAttrs, NodeStrAttrs);
3476  }
3477  if (DstNodeAttrV.Len() > 0) {
3478  AddNodeAttributes(DVal, DstNodeAttrV, CurrRowIdx, NodeIntAttrs, NodeFltAttrs, NodeStrAttrs);
3479  }
3480  }
3481 
3482  // aggregate node attributes and add to graph
3483  if (SrcNodeAttrV.Len() > 0 || DstNodeAttrV.Len() > 0) {
3484  for (TNEANet::TNodeI NodeI = Graph->BegNI(); NodeI < Graph->EndNI(); NodeI++) {
3485  TInt NId = NodeI.GetId();
3486  if (NodeIntAttrs.IsKey(NId)) {
3487  TStrIntVH IntAttrVals = NodeIntAttrs.GetDat(NId);
3488  for (TStrIntVH::TIter it = IntAttrVals.BegI(); it < IntAttrVals.EndI(); it++) {
3489  TInt AttrVal = AggregateVector<TInt>(it.GetDat(), AggrPolicy);
3490  Graph->AddIntAttrDatN(NId, AttrVal, it.GetKey());
3491  }
3492  }
3493  if (NodeFltAttrs.IsKey(NId)) {
3494  TStrFltVH FltAttrVals = NodeFltAttrs.GetDat(NId);
3495  for (TStrFltVH::TIter it = FltAttrVals.BegI(); it < FltAttrVals.EndI(); it++) {
3496  TFlt AttrVal = AggregateVector<TFlt>(it.GetDat(), AggrPolicy);
3497  Graph->AddFltAttrDatN(NId, AttrVal, it.GetKey());
3498  }
3499  }
3500  if (NodeStrAttrs.IsKey(NId)) {
3501  TStrStrVH StrAttrVals = NodeStrAttrs.GetDat(NId);
3502  for (TStrStrVH::TIter it = StrAttrVals.BegI(); it < StrAttrVals.EndI(); it++) {
3503  TStr AttrVal = AggregateVector<TStr>(it.GetDat(), AggrPolicy);
3504  Graph->AddStrAttrDatN(NId, AttrVal, it.GetKey());
3505  }
3506  }
3507  }
3508  }
3509 
3510  return Graph;
3511 }
3512 
3513 
3514 
3515 void TTable::InitRowIdBuckets(int NumBuckets) {
3516  for (TInt i = 0; i < RowIdBuckets.Len(); i++) {
3517  RowIdBuckets[i].Clr();
3518  }
3519  RowIdBuckets.Clr();
3520 
3521  RowIdBuckets.Gen(NumBuckets);
3522  for (TInt i = 0; i < NumBuckets; i++) {
3523  RowIdBuckets[i].Gen(10, 0);
3524  }
3525 }
3526 
3527 void TTable::FillBucketsByWindow(TStr SplitAttr, TInt JumpSize, TInt WindowSize, TInt StartVal, TInt EndVal) {
3528  Assert (JumpSize <= WindowSize);
3529  int NumBuckets, MinBucket, MaxBucket;
3530  TInt SplitColId = GetColIdx(SplitAttr);
3531 
3532  if (StartVal == TInt::Mn || EndVal == TInt::Mx) {
3533  // calculate min and max value of the column 'SplitAttr'
3534  TInt MinValue = TInt::Mx;
3535  TInt MaxValue = TInt::Mn;
3536  for (TInt i = 0; i < Next.Len(); i++) {
3537  if (Next[i] != Invalid) {
3538  if (MinValue > IntCols[SplitColId][i]) {
3539  MinValue = IntCols[SplitColId][i];
3540  }
3541  if (MaxValue < IntCols[SplitColId][i]) {
3542  MaxValue = IntCols[SplitColId][i];
3543  }
3544  }
3545  }
3546 
3547  if (StartVal == TInt::Mn) StartVal = MinValue;
3548  if (EndVal == TInt::Mx) EndVal = MaxValue;
3549  }
3550 
3551  // initialize buckets
3552  if (JumpSize == 0) { NumBuckets = (EndVal - StartVal)/JumpSize + 1; }
3553  else { NumBuckets = (EndVal - StartVal)/JumpSize + 1; }
3554 
3555  InitRowIdBuckets(NumBuckets);
3556 
3557  // populate RowIdSets by computing the range of buckets for each row
3558  for (TInt i = 0; i < Next.Len(); i++) {
3559  if (Next[i] == Invalid) { continue; }
3560  int SplitVal = IntCols[SplitColId][i];
3561  if (SplitVal < StartVal || SplitVal > EndVal) { continue; }
3562  int RowVal = SplitVal - StartVal;
3563  if (JumpSize == 0) { // expanding windows
3564  MinBucket = RowVal/WindowSize;
3565  MaxBucket = NumBuckets-1;
3566  } else if (JumpSize == WindowSize) { // disjoint windows
3567  MinBucket = MaxBucket = RowVal/JumpSize;
3568  } else { // sliding windows
3569  if (RowVal < WindowSize) { MinBucket = 0; }
3570  else { MinBucket = (RowVal-WindowSize)/JumpSize + 1; }
3571  MaxBucket = RowVal/JumpSize;
3572  }
3573  for (TInt j = MinBucket; j <= MaxBucket; j++) { RowIdBuckets[j].Add(i); }
3574  }
3575 }
3576 
3577 void TTable::FillBucketsByInterval(TStr SplitAttr, TIntPrV SplitIntervals) {
3578  TInt SplitColId = GetColIdx(SplitAttr);
3579  int NumBuckets = SplitIntervals.Len();
3580  InitRowIdBuckets(NumBuckets);
3581 
3582  // populate RowIdSets by computing the range of buckets for each row
3583  for (TInt i = 0; i < Next.Len(); i++) {
3584  if (Next[i] == Invalid) { continue; }
3585  int SplitVal = IntCols[SplitColId][i];
3586  for (TInt j = 0; j < SplitIntervals.Len(); j++) {
3587  if (SplitVal >= SplitIntervals[j].Val1 && SplitVal < SplitIntervals[j].Val2) {
3588  RowIdBuckets[j].Add(i);
3589  }
3590  }
3591  }
3592 }
3593 
3595  //call BuildGraph on each row id set - parallelizable!
3596  TVec<PNEANet> GraphSequence;
3597  for (TInt i = 0; i < RowIdBuckets.Len(); i++) {
3598  if (RowIdBuckets[i].Len() == 0) { continue; }
3599  PNEANet PNet = BuildGraph(RowIdBuckets[i], AggrPolicy);
3600  GraphSequence.Add(PNet);
3601  }
3602 
3603  return GraphSequence;
3604 }
3605 
3607  CurrBucket = -1;
3608  this->AggrPolicy = AggrPolicy;
3609  return GetNextGraphFromSequence();
3610 }
3611 
3613  CurrBucket++;
3614  while (CurrBucket < RowIdBuckets.Len() && RowIdBuckets[CurrBucket].Len() == 0) {
3615  CurrBucket++;
3616  }
3617  if (CurrBucket >= RowIdBuckets.Len()) { return NULL; }
3619 }
3620 
3621 // Only integer SplitAttr supported
3622 // Setting JumpSize = WindowSize will give disjoint windows
3623 // Setting JumpSize < WindowSize will give sliding windows
3624 // Setting JumpSize > WindowSize will drop certain rows (currently not supported)
3625 // Setting JumpSize = 0 will give expanding windows (i.e. starting at 0 and ending at i*WindowSize)
3626 // To set the range of values of SplitAttr to be considered, use StartVal and EndVal (inclusive)
3627 // If StartVal == TInt.Mn, then the buckets will start from the min value of SplitAttr in the table.
3628 // If EndVal == TInt.Mx, then the buckets will end at the max value of SplitAttr in the table.
3629 TVec<PNEANet> TTable::ToGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal, TInt EndVal) {
3630  FillBucketsByWindow(SplitAttr, JumpSize, WindowSize, StartVal, EndVal);
3631  printf("buckets filled\n");
3632  return GetGraphsFromSequence(AggrPolicy);
3633 }
3634 
3635 TVec<PNEANet> TTable::ToVarGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals) {
3636  FillBucketsByInterval(SplitAttr, SplitIntervals);
3637  return GetGraphsFromSequence(AggrPolicy);
3638 }
3639 
3641  return ToGraphSequence(GroupAttr, AggrPolicy, TInt(1), TInt(1), TInt::Mn, TInt::Mx);
3642 }
3643 
3644 PNEANet TTable::ToGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal, TInt EndVal) {
3645  FillBucketsByWindow(SplitAttr, JumpSize, WindowSize, StartVal, EndVal);
3646  return GetFirstGraphFromSequence(AggrPolicy);
3647 }
3648 
3649 PNEANet TTable::ToVarGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals) {
3650  FillBucketsByInterval(SplitAttr, SplitIntervals);
3651  return GetFirstGraphFromSequence(AggrPolicy);
3652 }
3653 
3655  return ToGraphSequenceIterator(GroupAttr, AggrPolicy, TInt(1), TInt(1), TInt::Mn, TInt::Mx);
3656 }
3657 
3658 // calls to this must be preceded by a call to one of the above ToGraph*Iterator functions
3660  return GetNextGraphFromSequence();
3661 }
3662 
3664  return CurrBucket >= RowIdBuckets.Len() - 1;
3665 }
3666 
3668  Schema SR;
3669  SR.Add(TPair<TStr,TAttrType>("node_id",atInt));
3670 
3671  TStrV IntAttrNames;
3672  TStrV FltAttrNames;
3673  TStrV StrAttrNames;
3674 
3675  TNEANet::TNodeI NodeI = Network->BegNI();
3676  NodeI.GetIntAttrNames(IntAttrNames);
3677  NodeI.GetFltAttrNames(FltAttrNames);
3678  NodeI.GetStrAttrNames(StrAttrNames);
3679  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3680  SR.Add(TPair<TStr,TAttrType>(IntAttrNames[i],atInt));
3681  }
3682  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3683  SR.Add(TPair<TStr,TAttrType>(FltAttrNames[i],atFlt));
3684  }
3685  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3686  SR.Add(TPair<TStr,TAttrType>(StrAttrNames[i],atStr));
3687  }
3688 
3689  PTable T = New(SR, Context);
3690 
3691  TInt Cnt = 0;
3692  // populate table columns
3693  while (NodeI < Network->EndNI()) {
3694  T->IntCols[0].Add(NodeI.GetId());
3695  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3696  T->IntCols[i+1].Add(Network->GetIntAttrDatN(NodeI,IntAttrNames[i]));
3697  }
3698  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3699  T->FltCols[i].Add(Network->GetFltAttrDatN(NodeI,FltAttrNames[i]));
3700  }
3701  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3702  T->AddStrVal(i, Network->GetStrAttrDatN(NodeI,StrAttrNames[i]));
3703  }
3704  Cnt++;
3705  NodeI++;
3706  }
3707  // set number of rows and "Next" vector
3708  T->NumRows = Cnt;
3709  T->NumValidRows = T->NumRows;
3710  T->Next = TIntV(T->NumRows,0);
3711  for (TInt i = 0; i < T->NumRows-1; i++) {
3712  T->Next.Add(i+1);
3713  }
3714  T->LastValidRow = T->NumRows-1;
3715  T->Next.Add(Last);
3716  return T;
3717 }
3718 
3720  Schema SR;
3721  SR.Add(TPair<TStr,TAttrType>("edg_id",atInt));
3722  SR.Add(TPair<TStr,TAttrType>("src_id",atInt));
3723  SR.Add(TPair<TStr,TAttrType>("dst_id",atInt));
3724 
3725  TStrV IntAttrNames;
3726  TStrV FltAttrNames;
3727  TStrV StrAttrNames;
3728 
3729  TNEANet::TEdgeI EdgeI = Network->BegEI();
3730  EdgeI.GetIntAttrNames(IntAttrNames);
3731  EdgeI.GetFltAttrNames(FltAttrNames);
3732  EdgeI.GetStrAttrNames(StrAttrNames);
3733  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3734  SR.Add(TPair<TStr,TAttrType>(IntAttrNames[i],atInt));
3735  }
3736  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3737  SR.Add(TPair<TStr,TAttrType>(FltAttrNames[i],atFlt));
3738  }
3739  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3740  //printf("%s\n",StrAttrNames[i].CStr());
3741  SR.Add(TPair<TStr,TAttrType>(StrAttrNames[i],atStr));
3742  }
3743 
3744  PTable T = New(SR, Context);
3745 
3746  TInt Cnt = 0;
3747  // populate table columns
3748  while (EdgeI < Network->EndEI()) {
3749  T->IntCols[0].Add(EdgeI.GetId());
3750  T->IntCols[1].Add(EdgeI.GetSrcNId());
3751  T->IntCols[2].Add(EdgeI.GetDstNId());
3752  for (TInt i = 0; i < IntAttrNames.Len(); i++) {
3753  T->IntCols[i+3].Add(Network->GetIntAttrDatE(EdgeI,IntAttrNames[i]));
3754  }
3755  for (TInt i = 0; i < FltAttrNames.Len(); i++) {
3756  T->FltCols[i].Add(Network->GetFltAttrDatE(EdgeI,FltAttrNames[i]));
3757  }
3758  for (TInt i = 0; i < StrAttrNames.Len(); i++) {
3759  T->AddStrVal(i, Network->GetStrAttrDatE(EdgeI,StrAttrNames[i]));
3760  }
3761  Cnt++;
3762  EdgeI++;
3763  }
3764  // set number of rows and "Next" vector
3765  T->NumRows = Cnt;
3766  T->NumValidRows = T->NumRows;
3767  T->Next = TIntV(T->NumRows,0);
3768  for (TInt i = 0; i < T->NumRows-1; i++) {
3769  T->Next.Add(i+1);
3770  }
3771  T->LastValidRow = T->NumRows-1;
3772  T->Next.Add(Last);
3773  return T;
3774 }
3775 
3776 #ifdef GCC_ATOMIC
3778  Schema SR;
3779  SR.Add(TPair<TStr,TAttrType>("src_id",atInt));
3780  SR.Add(TPair<TStr,TAttrType>("dst_id",atInt));
3781 
3782  TNGraphMP::TEdgeI FirstEI = Network->BegEI();
3783  PTable T = New(SR, Context);
3784  TInt NumEdges = Network->GetEdges();
3785  TInt NumPartitions = omp_get_max_threads()*CHUNKS_PER_THREAD;
3786  TInt PartitionSize = NumEdges/NumPartitions;
3787  if (PartitionSize*NumPartitions < NumEdges) { NumPartitions++;}
3788 
3790  TVec<TEIPr> Partitions;
3791  TIntV PartitionSizes;
3792  TNGraphMP::TEdgeI currStart = FirstEI;
3793  TInt currCount = 0;
3794  while (FirstEI < Network->EndEI()){
3795  if (currCount == PartitionSize) {
3796  Partitions.Add(TEIPr(currStart, FirstEI));
3797  currStart = FirstEI;
3798  PartitionSizes.Add(currCount);
3799  //printf("added: %d\n", currCount.Val);
3800  currCount = 0;
3801  }
3802  //printf("%d\n", currCount.Val);
3803  FirstEI++;
3804  currCount++;
3805  }
3806  Partitions.Add(TEIPr(currStart, FirstEI));
3807  PartitionSizes.Add(currCount);
3808 
3809  T->ResizeTable(NumEdges);
3810  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
3811  for (int p = 0; p < Partitions.Len(); p++) {
3812  TNGraphMP::TEdgeI EdgeI = Partitions[p].GetVal1();
3813  TNGraphMP::TEdgeI EndI = Partitions[p].GetVal2();
3814  //printf("Thread = %d, p = %d, size = %d\n", omp_get_thread_num(), p, PartitionSizes[p].Val);
3815  int start = T->GetEmptyRowsStart(PartitionSizes[p]);
3816  while (EdgeI < EndI) {
3817  T->IntCols[0][start] = EdgeI.GetSrcNId();
3818  T->IntCols[1][start] = EdgeI.GetDstNId();
3819  EdgeI++;
3820  if (EdgeI < EndI) { T->Next[start] = start+1;}
3821  start++;
3822  }
3823  }
3824 
3825  Assert(T->NumRows == NumEdges);
3826  return T;
3827 }
3828 #endif // GCC_ATOMIC
3829 
3830 PTable TTable::GetFltNodePropertyTable(const PNEANet& Network, const TIntFltH& Property,
3831  const TStr& NodeAttrName, const TAttrType& NodeAttrType, const TStr& PropertyAttrName,
3832  TTableContext* Context) {
3833  Schema SR;
3834  // Determine type of node id
3835  SR.Add(TPair<TStr,TAttrType>(NodeAttrName,NodeAttrType));
3836  SR.Add(TPair<TStr,TAttrType>(PropertyAttrName,atFlt));
3837  PTable T = New(SR, Context);
3838  TInt NodeColIdx = T->GetColIdx(NodeAttrName);
3839  TInt Cnt = 0;
3840  // populate table columns
3841  for (TNEANet::TNodeI NodeI = Network->BegNI(); NodeI < Network->EndNI(); NodeI++) {
3842  switch (NodeAttrType) {
3843  case atInt:
3844  T->IntCols[NodeColIdx].Add(Network->GetIntAttrDatN(NodeI,NodeAttrName));
3845  break;
3846  case atFlt:
3847  T->FltCols[NodeColIdx].Add(Network->GetFltAttrDatN(NodeI,NodeAttrName));
3848  break;
3849  case atStr:
3850  T->AddStrVal(TInt(0), Network->GetStrAttrDatN(NodeI,NodeAttrName));
3851  break;
3852  }
3853  T->FltCols[0].Add(Property.GetDat(NodeI.GetId()));
3854  Cnt++;
3855  }
3856  // set number of rows and "Next" vector
3857  T->NumRows = Cnt;
3858  T->NumValidRows = T->NumRows;
3859  T->Next = TIntV(T->NumRows,0);
3860  for (TInt i = 0; i < T->NumRows-1; i++) {
3861  T->Next.Add(i+1);
3862  }
3863  T->LastValidRow = T->NumRows-1;
3864  T->Next.Add(Last);
3865  return T;
3866 }
3867 
3868 /*** Special Filters ***/
3869 PTable TTable::IsNextK(const TStr& OrderCol, TInt K, const TStr& GroupBy, const TStr& RankColName) {
3870  TStrV OrderBy;
3871  if (GroupBy.Empty()) {
3872  OrderBy.Add(OrderCol);
3873  } else {
3874  OrderBy.Add(GroupBy);
3875  OrderBy.Add(OrderCol);
3876  }
3877  if (RankColName.Empty()) {
3878  Order(OrderBy);
3879  } else {
3880  Order(OrderBy, RankColName, true);
3881  }
3882  TAttrType GroupByAttrType = GetColType(GroupBy);
3883  PTable T = InitializeJointTable(*this);
3884  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
3885  TInt Succ = RI.GetRowIdx();
3886  TBool OutOfGroup = false;
3887  for (TInt i = 0; i < K; i++) {
3888  Succ = Next[Succ];
3889  if (Succ == Last) { break; }
3890  switch (GroupByAttrType) {
3891  case atInt:
3892  if (GetIntVal(GroupBy, Succ) != RI.GetIntAttr(GroupBy)) { OutOfGroup = true; }
3893  break;
3894  case atFlt:
3895  if (GetFltVal(GroupBy, Succ) != RI.GetFltAttr(GroupBy)) { OutOfGroup = true; }
3896  break;
3897  case atStr:
3898  if (GetStrVal(GroupBy, Succ) != RI.GetStrAttr(GroupBy)) { OutOfGroup = true; }
3899  break;
3900  }
3901  if (OutOfGroup) { break; } // break out of inner for loop
3902  T->AddJointRow(*this, *this, RI.GetRowIdx(), Succ);
3903  }
3904  }
3905  return T;
3906 }
3907 
// Body of a size-reporting routine; its signature line is not part of this listing.
// Prints the total and valid row counts, the number of int/float/string
// columns, and the estimate returned by GetMemUsedKB() to stdout.
3909  printf("Total number of rows: %d\n", NumRows.Val);
3910  printf("Number of valid rows: %d\n", NumValidRows.Val);
3911  printf("Number of Int columns: %d\n", IntCols.Len());
3912  printf("Number of Flt columns: %d\n", FltCols.Len());
3913  printf("Number of Str columns: %d\n", StrColMaps.Len());
3914  TSize MemUsed = GetMemUsedKB();
// NOTE(review): "%lu" assumes TSize is an unsigned long on the target platform -- confirm.
3915  printf("Approximated size is %lu KB\n", MemUsed);
3916 }
3917 
// Body of the memory estimate; its signature line is not part of this listing.
// Accumulates GetMemUsed()/1000 for the Next vector, every int, float and
// string-map column, and the row-id / grouping containers, and returns the sum.
// Each contribution is divided by 1000 on its own before being added.
3919  TSize ApproxSize = 0;
3920  ApproxSize += Next.GetMemUsed()/1000; // Next vector
3921  for(int i = 0; i < IntCols.Len(); i++){
3922  ApproxSize += IntCols[i].GetMemUsed()/1000;
3923  }
3924  for(int i = 0; i < FltCols.Len(); i++){
3925  ApproxSize += FltCols[i].GetMemUsed()/1000;
3926  }
3927  for(int i = 0; i < StrColMaps.Len(); i++){
3928  ApproxSize += StrColMaps[i].GetMemUsed()/1000;
3929  }
3930  ApproxSize += RowIdMap.GetMemUsed()/1000;
3931  ApproxSize += GroupIDMapping.GetMemUsed()/1000;
3932  ApproxSize += GroupMapping.GetMemUsed()/1000;
3933  ApproxSize += RowIdBuckets.GetMemUsed() / 1000;
3934  return ApproxSize;
3935 }
3936 
// Body of a context-reporting routine; its signature line is not part of this listing.
// Prints Context->StringVals.Len() and Context->StringVals.Reserved(),
// followed by the value returned by GetContextMemUsedKB().
3938  printf("Number of strings in pool: ");
3939  printf("%d\n", Context->StringVals.Len());
3940  printf("Number of entries in hash table: ");
3941  printf("%d\n", Context->StringVals.Reserved());
3942  TSize MemUsed = GetContextMemUsedKB();
// NOTE(review): "%lu" assumes TSize is an unsigned long on the target platform -- confirm.
3943  printf("Approximate memory used for Context: %lu KB\n", MemUsed);
3944 }
3945 
// Body of the context memory estimate; its signature line is not part of this listing.
// Returns Context->StringVals.GetMemUsed().
// NOTE(review): the value is not divided by 1000 here, although the caller
// visible in this file prints it with a "KB" label -- confirm the intended unit.
3947  TSize ApproxSize = 0;
3948  ApproxSize += Context->StringVals.GetMemUsed();
3949  return ApproxSize;
3950 }
3951 
// Appends the column data and Next links of table T to this table.
// For every column in this table's schema the matching column of T is looked
// up by name (this table's id column is matched with T's id column); an
// exception is thrown when T lacks the column.
// NOTE(review): the listing skips source lines 3982, 3984 and 3986, so the
// statements that stitch the two row chains together are not visible here.
3952 void TTable::AddTable(const TTable& T) {
3953  //for (TInt c = 0; c < S.Len(); c++) {
3954  // if (S[c] != T.S[c]) { printf("(%s,%d) != (%s,%d)\n", S[c].Val1.CStr(), S[c].Val2, T.S[c].Val1.CStr(), T.S[c].Val2); TExcept::Throw("when adding tables, their schemas must match!"); }
3955  //}
3956  for (TInt c = 0; c < Sch.Len(); c++) {
3957  TStr ColName = GetSchemaColName(c);
3958  TInt ColIdx = GetColIdx(ColName);
3959  TInt TColIdx = ColName == IdColName ? T.GetColIdx(T.IdColName) : T.GetColIdx(ColName);
3960  if (TColIdx < 0) { TExcept::Throw("when adding a table, it must contain all columns of source table!"); }
// append T's whole column vector to the column of the same type in this table
3961  switch (GetColType(ColName)) {
3962  case atInt:
3963  IntCols[ColIdx].AddV(T.IntCols[TColIdx]);
3964  break;
3965  case atFlt:
3966  FltCols[ColIdx].AddV(T.FltCols[TColIdx]);
3967  break;
3968  case atStr:
3969  StrColMaps[ColIdx].AddV(T.StrColMaps[TColIdx]);
3970  break;
3971  }
3972  }
3973 
// copy T's Next vector and shift every real link by this table's current
// NumRows; the Last and Invalid sentinels are left untouched
3974  TIntV TNext(T.Next);
3975  for (TInt i = 0; i < TNext.Len(); i++) {
3976  if (TNext[i] != Last && TNext[i] != Invalid) { TNext[i] += NumRows; }
3977  }
3978 
3979  Next.AddV(TNext);
3980  // checks if table is empty
3981  if (LastValidRow >= 0) {
3983  }
3985  NumRows += T.NumRows;
3987 }
3988 
3989 // returns physical indices of rows of given table present in our table
3990 // we assume that schema matches exactly (including index of id cols)
// Table      - table whose rows are tested against this table.
// Collisions - output set; receives the row index (in Table) of every row
//              whose non-id column values form a key present in Grouping.
// Throws when the two schemas differ at any position.
// NOTE(review): the listing skips source line 3993; the declaration of
// `Grouping` used below is not visible here.
3991 void TTable::GetCollidingRows(const TTable& Table, THashSet<TInt>& Collisions) {
3992  TIntV UniqueVec;
3994  TStrV GroupBy;
3995 
3996  // indices of columns of each type
3997  TIntV IntGroupByCols;
3998  TIntV FltGroupByCols;
3999  TIntV StrGroupByCols;
4000 
4001  TInt IKLen, FKLen, SKLen;
4002 
4003  // check that schemas match
4004  for (TInt c = 0; c < Sch.Len(); c++) {
// the id column is only checked for position and is not part of the key
4005  if (Sch[c].Val1 == IdColName) {
4006  if (Table.Sch[c].Val1 != Table.GetIdColName()) {
4007  TExcept::Throw("GetCollidingRows: schemas do not match!");
4008  }
4009  continue;
4010  }
4011  if (Sch[c] != Table.Sch[c]) {
4012  printf("(%s,%d) != (%s,%d)\n", Sch[c].Val1.CStr(), Sch[c].Val2, Table.Sch[c].Val1.CStr(), Table.Sch[c].Val2);
4013  TExcept::Throw("GetCollidingRows: schemas do not match!");
4014  }
4015  GroupBy.Add(NormalizeColName(Sch[c].Val1));
// record the per-type column index as seen by Table, used to read its rows below
4016  TPair<TAttrType, TInt> ColType = Table.GetColTypeMap(Sch[c].Val1);
4017  switch (ColType.Val1) {
4018  case atInt:
4019  IntGroupByCols.Add(ColType.Val2);
4020  break;
4021  case atFlt:
4022  FltGroupByCols.Add(ColType.Val2);
4023  break;
4024  case atStr:
4025  StrGroupByCols.Add(ColType.Val2);
4026  break;
4027  }
4028  }
4029 
4030  IKLen = IntGroupByCols.Len();
4031  FKLen = FltGroupByCols.Len();
4032  SKLen = StrGroupByCols.Len();
4033 
4034  // group rows of first table
4035  GroupAux(GroupBy, Grouping, true, "", false, UniqueVec, true);
4036 
4037  // find colliding rows of second table
4038  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4039  // read keys from row
// integer key holds the int values followed by the string-map ids
4040  TIntV IKey(IKLen + SKLen, 0);
4041  TFltV FKey(FKLen, 0);
4042 
4043  // find group key
4044  for (TInt c = 0; c < IKLen; c++) {
4045  IKey.Add(it.GetIntAttr(IntGroupByCols[c]));
4046  }
4047  for (TInt c = 0; c < FKLen; c++) {
4048  FKey.Add(it.GetFltAttr(FltGroupByCols[c]));
4049  }
4050  for (TInt c = 0; c < SKLen; c++) {
4051  IKey.Add(it.GetStrMapById(StrGroupByCols[c]));
4052  }
4053  // look for group matching the key
4054  TGroupKey GroupKey = TGroupKey(IKey, FKey);
4055 
4056  TInt RowIdx = it.GetRowIdx();
4057  if (Grouping.IsKey(GroupKey)) {
4058  // row exists in first table
4059  Collisions.AddKey(RowIdx);
4060  }
4061  }
4062 }
4063 
// Adds a new integer column ColName and fills it from ColVals.
// ColVals must have exactly NumRows entries; otherwise a message is printed
// and the table is left unchanged. Values are consumed in iteration order:
// the i-th row visited by the row iterator receives ColVals[i].
// NOTE(review): the listing skips source line 4070; the statement that
// appends the new column vector to IntCols is not visible here.
4064 void TTable::StoreIntCol(const TStr& ColName, const TIntV& ColVals) {
4065  if (ColVals.Len() != NumRows) {
4066  printf("new column dimension must agree with number of rows\n");
4067  return;
4068  }
4069  AddSchemaCol(ColName, atInt);
4071  TInt ColIdx = IntCols.Len()-1;
4072  TInt i = 0;
4073  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4074  IntCols[ColIdx][RI.GetRowIdx()] = ColVals[i];
4075  i++;
4076  }
4077  TInt L = IntCols.Len();
4078  AddColType(ColName, atInt, L-1);
4079 }
4080 
4081 void TTable::StoreFltCol(const TStr& ColName, const TFltV& ColVals) {
4082  if (ColVals.Len() != NumRows) {
4083  printf("new column dimension must agree with number of rows\n");
4084  return;
4085  }
4086  AddSchemaCol(ColName, atFlt);
4087  FltCols.Add(TFltV(NumRows));
4088  TInt ColIdx = FltCols.Len()-1;
4089  TInt i = 0;
4090  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4091  FltCols[ColIdx][RI.GetRowIdx()] = ColVals[i];
4092  i++;
4093  }
4094  TInt L = FltCols.Len();
4095  AddColType(ColName, atFlt, L-1);
4096 }
4097 
// Adds a new string column ColName and fills it from ColVals, storing for
// each row the id of the string in Context->StringVals.
// ColVals must have exactly NumRows entries; otherwise a message is printed
// and the table is left unchanged.
// NOTE(review): the listing skips source line 4104; the statement that
// appends the new column vector is not visible here.
4098 void TTable::StoreStrCol(const TStr& ColName, const TStrV& ColVals) {
4099  if (ColVals.Len() != NumRows) {
4100  printf("new column dimension must agree with number of rows\n");
4101  return;
4102  }
4103  AddSchemaCol(ColName, atStr);
// NOTE(review): the index is derived from FltCols.Len() but is used below to
// index StrColMaps -- looks like it should be StrColMaps.Len()-1; confirm.
4105  TInt ColIdx = FltCols.Len()-1;
4106  TInt i = 0;
4107  for (TRowIterator RI = BegRI(); RI < EndRI(); RI++) {
4108  TInt Key = Context->StringVals.GetKeyId(ColVals[i]);
// NOTE(review): when the string is new, AddKey's result is not assigned to
// Key, so the value stored on the next line is still -1 -- confirm intent.
4109  if (Key == -1) { Context->StringVals.AddKey(ColVals[i]); }
4110  StrColMaps[ColIdx][RI.GetRowIdx()] = Key;
4111  i++;
4112  }
4113  TInt L = StrColMaps.Len();
4114  AddColType(ColName, atStr, L-1);
4115 }
4116 
// Body of a row-bookkeeping routine; its signature line is not part of this listing.
// Appends the Last sentinel to Next and increments NumRows and NumValidRows.
// NOTE(review): the listing skips source lines 4119 and 4122, so the body of
// the LastValidRow branch and one further statement are not visible here.
4118  if (LastValidRow >= 0) {
4120  }
4121  Next.Add(Last);
4123 
4124  NumRows++;
4125  NumValidRows++;
4126 }
4127 
4128 #ifdef GCC_ATOMIC
4129 void TTable::SetFltColToConstMP(TInt UpdateColIdx, TFlt DefaultFltVal){
4130  if(!GetMP()){ TExcept::Throw("Not Using MP!");}
4131  TIntPrV Partitions;
4132  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
4133  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
4134  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4135  for (int i = 0; i < Partitions.Len(); i++){
4136  TRowIterator RowI(Partitions[i].GetVal1(), this);
4137  TRowIterator EndI(Partitions[i].GetVal2(), this);
4138  while(RowI < EndI){
4139  FltCols[UpdateColIdx][RowI.GetRowIdx()] = DefaultFltVal;
4140  RowI++;
4141  }
4142  }
4143 }
4144 
4145 // OP RS 2016/06/30: this wrapper function is required
4146 // for the code to compile on Mac OS X gcc 4.2.1
// The signature line is not part of this listing. The body atomically swaps
// *lock from 0 to 1 and returns the builtin's result (true when the swap
// took place).
4148  return(__sync_bool_compare_and_swap(lock, 0, 1));
4149 }
4150 
4151 void TTable::UpdateFltFromTableMP(const TStr& KeyAttr, const TStr& UpdateAttr,
4152  const TTable& Table, const TStr& FKeyAttr, const TStr& ReadAttr,
4153  TFlt DefaultFltVal) {
4154  if (!GetMP()) {
4155  TExcept::Throw("Not Using MP!");
4156  }
4157 
4158  TAttrType KeyType = GetColType(KeyAttr);
4159  TAttrType FKeyType = Table.GetColType(FKeyAttr);
4160  if(KeyType != FKeyType){TExcept::Throw("Key Type Mismatch");}
4161  if(GetColType(UpdateAttr) != atFlt || Table.GetColType(ReadAttr) != atFlt){
4162  TExcept::Throw("Expecting Float values");
4163  }
4164  TStr NKeyAttr = NormalizeColName(KeyAttr);
4165  //TStr NUpdateAttr = NormalizeColName(UpdateAttr);
4166  //TStr NFKeyAttr = Table.NormalizeColName(FKeyAttr);
4167  //TStr NReadAttr = Table.NormalizeColName(ReadAttr);
4168  TInt UpdateColIdx = GetColIdx(UpdateAttr);
4169  TInt FKeyColIdx = GetColIdx(FKeyAttr);
4170  TInt ReadColIdx = GetColIdx(ReadAttr);
4171 
4172  // TODO: this should be a generic vector operation
4173  SetFltColToConstMP(UpdateColIdx, DefaultFltVal);
4174 
4175  TIntPrV Partitions;
4176  Table.GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
4177  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
4178  TIntV Locks(NumRows);
4179  Locks.PutAll(0); // need to parallelize this...
4180 
4181  switch (KeyType) {
4182  // TODO: add support for other cases of KeyType
4183  case atInt: {
4184  THashMP<TInt,TIntV> Grouping;
4185  // must use physical row ids
4186  GroupByIntColMP(NKeyAttr, Grouping, true);
4187  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD) // num_threads(1)
4188  for (int i = 0; i < Partitions.Len(); i++) {
4189  TRowIterator RowI(Partitions[i].GetVal1(), &Table);
4190  TRowIterator EndI(Partitions[i].GetVal2(), &Table);
4191  while (RowI < EndI) {
4192  TInt K = RowI.GetIntAttr(FKeyColIdx);
4193  if (Grouping.IsKey(K)) {
4194  TIntV& UpdateRows = Grouping.GetDat(K);
4195  for (int j = 0; j < UpdateRows.Len(); j++) {
4196  int* lock = &Locks[UpdateRows[j]].Val;
4197  // OP RS 2016/06/30: needed to define a wrapper function
4198  // for the code to compile on Mac OS X gcc 4.2.1
4199  //if (!__sync_bool_compare_and_swap(lock, 0, 1)) {
4200  if (!sync_bool_compare_and_swap(lock)) {
4201  continue;
4202  }
4203  //printf("key = %d, row = %d, old_score = %f\n", K.Val, j, UpdateRows[j].Val, FltCols[UpdateColIdx][UpdateRows[j]].Val);
4204  FltCols[UpdateColIdx][UpdateRows[j]] = RowI.GetFltAttr(ReadColIdx);
4205  //printf("key = %d, new_score = %f\n", K.Val, j, FltCols[UpdateColIdx][UpdateRows[j]].Val);
4206  } // end of for loop
4207  } // end of if statement
4208  RowI++;
4209  } // end of while loop
4210  } // end of for loop
4211  } // end of case atInt
4212  break;
4213  default:
4214  break;
4215  } // end of outer switch statement
4216 }
4217 #endif // GCC_ATOMIC
4218 
4219 void TTable::UpdateFltFromTable(const TStr& KeyAttr, const TStr& UpdateAttr, const TTable& Table,
4220  const TStr& FKeyAttr, const TStr& ReadAttr, TFlt DefaultFltVal){
4221  if(!IsColName(KeyAttr)){ TExcept::Throw("Bad KeyAttr parameter");}
4222  if(!IsColName(UpdateAttr)){ TExcept::Throw("Bad UpdateAttr parameter");}
4223  if(!Table.IsColName(FKeyAttr)){ TExcept::Throw("Bad FKeyAttr parameter");}
4224  if(!Table.IsColName(ReadAttr)){ TExcept::Throw("Bad ReadAttr parameter");}
4225 
4226 #ifdef GCC_ATOMIC
4227  if(GetMP()){
4228  UpdateFltFromTableMP(KeyAttr, UpdateAttr,Table, FKeyAttr, ReadAttr, DefaultFltVal);
4229  return;
4230  }
4231 #endif // GCC_ATOMIC
4232 
4233  TAttrType KeyType = GetColType(KeyAttr);
4234  TAttrType FKeyType = Table.GetColType(FKeyAttr);
4235  if(KeyType != FKeyType){TExcept::Throw("Key Type Mismatch");}
4236  if(GetColType(UpdateAttr) != atFlt || Table.GetColType(ReadAttr) != atFlt){
4237  TExcept::Throw("Expecting Float values");
4238  }
4239  TStr NKeyAttr = NormalizeColName(KeyAttr);
4240  TStr NUpdateAttr = NormalizeColName(UpdateAttr);
4241  TStr NFKeyAttr = Table.NormalizeColName(FKeyAttr);
4242  TStr NReadAttr = Table.NormalizeColName(ReadAttr);
4243  TInt UpdateColIdx = GetColIdx(UpdateAttr);
4244 
4245  for(TRowIterator iter = BegRI(); iter < EndRI(); iter++){
4246  FltCols[UpdateColIdx][iter.GetRowIdx()] = DefaultFltVal;
4247  }
4248 
4249  switch(KeyType) {
4250  // TODO: add support for other cases of KeyType
4251  case atInt: {
4252  TIntIntVH Grouping;
4253  GroupByIntCol(NKeyAttr, Grouping, TIntV(), true, true);
4254  for (TRowIterator RI = Table.BegRI(); RI < Table.EndRI(); RI++) {
4255  TInt K = RI.GetIntAttr(NFKeyAttr);
4256  if (Grouping.IsKey(K)) {
4257  TIntV& UpdateRows = Grouping.GetDat(K);
4258  for (int i = 0; i < UpdateRows.Len(); i++) {
4259  FltCols[UpdateColIdx][UpdateRows[i]] = RI.GetFltAttr(NReadAttr);
4260  } // end of for loop
4261  } // end of if statement
4262  } // end of for loop
4263  } // end of case atInt
4264  break;
4265  default:
4266  break;
4267  } // end of outer switch statement
4268 }
4269 
4270 
4271 // can ONLY be called when a table is being initialised (before IDs are allocated)
// Appends the values of the row referenced by RI to this table, column by
// column, looking each value up in RI by column name. The id column is skipped.
// NOTE(review): the listing skips source line 4291; the statement following
// the column loop is not visible here.
4272 void TTable::AddRow(const TRowIterator& RI) {
4273  for (TInt c = 0; c < Sch.Len(); c++) {
4274  TStr ColName = GetSchemaColName(c);
4275  if (ColName == IdColName) { continue; }
4276 
4277  TInt ColIdx = GetColIdx(ColName);
4278 
4279  switch (GetColType(ColName)) {
4280  case atInt:
4281  IntCols[ColIdx].Add(RI.GetIntAttr(ColName));
4282  break;
4283  case atFlt:
4284  FltCols[ColIdx].Add(RI.GetFltAttr(ColName));
4285  break;
4286  case atStr:
4287  StrColMaps[ColIdx].Add(RI.GetStrMapByName(ColName));
4288  break;
4289  }
4290  }
4292 }
4293 
// Appends one row given as per-type value vectors: IntVals[c] goes to
// IntCols[c], FltVals[c] to FltCols[c], and StrVals[c] is added to string
// column c through AddStrVal.
// NOTE(review): the vectors' lengths are not checked against the number of
// columns of each type -- callers presumably pass matching sizes; confirm.
// NOTE(review): the listing skips source line 4304; the statement following
// the loops is not visible here.
4294 void TTable::AddRow(const TIntV& IntVals, const TFltV& FltVals, const TStrV& StrVals) {
4295  for (TInt c = 0; c < IntVals.Len(); c++) {
4296  IntCols[c].Add(IntVals[c]);
4297  }
4298  for (TInt c = 0; c < FltVals.Len(); c++) {
4299  FltCols[c].Add(FltVals[c]);
4300  }
4301  for (TInt c = 0; c < StrVals.Len(); c++) {
4302  AddStrVal(c, StrVals[c]);
4303  }
4305 }
4306 
// Resizes every column vector and the Next vector to RowCount entries:
// Reserve(RowCount, RowCount) when Next is currently shorter, Trunc(RowCount)
// when it is longer, nothing when the length already matches.
// NOTE(review): the listing skips source lines 4311 and 4312, so part of the
// RowCount == 0 branch is not visible here.
4307 void TTable::ResizeTable(int RowCount) {
4308  if (RowCount == 0) {
4309  // initialize empty table
4310  NumValidRows = 0;
4313  }
4314  if (Next.Len() < RowCount) {
// columns of all three types are addressed through one flat index so a single
// loop covers them; the extra index TotalCols stands for the Next vector
4315  TInt FltOffset = IntCols.Len();
4316  TInt StrOffset = FltOffset + FltCols.Len();
4317  TInt TotalCols = StrOffset + StrColMaps.Len();
4318 #ifdef USE_OPENMP
4319  #pragma omp parallel for schedule(static)
4320 #endif
4321  for (int i = 0; i < TotalCols+1; i++) {
4322  if (i < FltOffset) {
4323  IntCols[i].Reserve(RowCount, RowCount);
4324  } else if (i < StrOffset) {
4325  FltCols[i-FltOffset].Reserve(RowCount, RowCount);
4326  } else if (i < TotalCols) {
4327  StrColMaps[i-StrOffset].Reserve(RowCount, RowCount);
4328  } else {
4329  Next.Reserve(RowCount, RowCount);
4330  }
4331  }
4332  } else if (Next.Len() > RowCount) {
// same flat indexing as above, truncating instead of growing
4333  TInt FltOffset = IntCols.Len();
4334  TInt StrOffset = FltOffset + FltCols.Len();
4335  TInt TotalCols = StrOffset + StrColMaps.Len();
4336 #ifdef USE_OPENMP
4337  #pragma omp parallel for schedule(static)
4338 #endif
4339  for (int i = 0; i < TotalCols+1; i++) {
4340  if (i < FltOffset) {
4341  IntCols[i].Trunc(RowCount);
4342  } else if (i < StrOffset) {
4343  FltCols[i-FltOffset].Trunc(RowCount);
4344  } else if (i < TotalCols) {
4345  StrColMaps[i-StrOffset].Trunc(RowCount);
4346  } else {
4347  Next.Trunc(RowCount);
4348  }
4349  }
4350  }
4351 }
4352 
4353 int TTable::GetEmptyRowsStart(int NewRows) {
4354  int start = -1;
4355 #ifdef USE_OPENMP
4356  #pragma omp critical
4357  {
4358 #endif
4359  start = NumRows;
4360  NumRows += NewRows;
4361  NumValidRows += NewRows;
4362  // To make this function thread-safe, the following call must be done before the
4363  // code enters parallel region.
4364  // ResizeTable(NumRows);
4365  Assert(NumRows <= Next.Len());
4366  if (LastValidRow >= 0) {Next[LastValidRow] = start;}
4367  LastValidRow = start+NewRows-1;
4368  Next[LastValidRow] = Last;
4369 #ifdef USE_OPENMP
4370  }
4371 #endif
4372  Assert (start >= 0);
4373  return start;
4374 }
4375 
4376 void TTable::AddSelectedRows(const TTable& Table, const TIntV& RowIDs) {
4377  int NewRows = RowIDs.Len();
4378  if (NewRows == 0) { return; }
4379  // this call should be thread-safe
4380  int start = GetEmptyRowsStart(NewRows);
4381  for (TInt r = 0; r < NewRows; r++) {
4382  TInt CurrRowIdx = RowIDs[r];
4383  for (TInt i = 0; i < Table.IntCols.Len(); i++) {
4384  IntCols[i][start+r] = Table.IntCols[i][CurrRowIdx];
4385  }
4386  for (TInt i = 0; i < Table.FltCols.Len(); i++) {
4387  FltCols[i][start+r] = Table.FltCols[i][CurrRowIdx];
4388  }
4389  for (TInt i = 0; i < Table.StrColMaps.Len(); i++) {
4390  StrColMaps[i][start+r] = Table.StrColMaps[i][CurrRowIdx];
4391  }
4392  }
4393  for (TInt r = 0; r < NewRows-1; r++) {
4394  Next[start+r] = start+r+1;
4395  }
4396 }
4397 
4398 void TTable::AddNRows(int NewRows, const TVec<TIntV>& IntColsP, const TVec<TFltV>& FltColsP, const TVec<TIntV>& StrColMapsP) {
4399  if (NewRows == 0) { return; }
4400  // this call should be thread-safe
4401  int start = GetEmptyRowsStart(NewRows);
4402  for (TInt r = 0; r < NewRows; r++) {
4403  for (TInt i = 0; i < IntColsP.Len(); i++) {
4404  IntCols[i][start+r] = IntColsP[i][r];
4405  }
4406  for (TInt i = 0; i < FltColsP.Len(); i++) {
4407  FltCols[i][start+r] = FltColsP[i][r];
4408  }
4409  for (TInt i = 0; i < StrColMapsP.Len(); i++) {
4410  StrColMaps[i][start+r] = StrColMapsP[i][r];
4411  }
4412  }
4413  for (TInt r = 0; r < NewRows-1; r++) {
4414  Next[start+r] = start+r+1;
4415  }
4416 }
4417 
4418 #ifdef USE_OPENMP
4419 void TTable::AddNJointRowsMP(const TTable& T1, const TTable& T2, const TVec<TIntPrV>& JointRowIDSet) {
4420  //double startFn = omp_get_wtime();
4421  int JointTableSize = 0;
4422  TIntV StartOffsets(JointRowIDSet.Len());
4423  for (int i = 0; i < JointRowIDSet.Len(); i++) {
4424  StartOffsets[i] = JointTableSize;
4425  JointTableSize += JointRowIDSet[i].Len();
4426  }
4427  if (JointTableSize == 0) {
4428  TExcept::Throw("Joint table is empty");
4429  }
4430  //double endOffsets = omp_get_wtime();
4431  //printf("Offsets time = %f\n",endOffsets-startFn);
4432  ResizeTable(JointTableSize);
4433  //double endResize = omp_get_wtime();
4434  //printf("Resize time = %f\n",endResize-endOffsets);
4435  NumRows = JointTableSize;
4436  NumValidRows = JointTableSize;
4437  Assert(NumRows <= Next.Len());
4438 
4439  TInt IntOffset = T1.IntCols.Len();
4440  TInt FltOffset = T1.FltCols.Len();
4441  TInt StrOffset = T1.StrColMaps.Len();
4442 
4443  TInt IdOffset = IntOffset + T2.IntCols.Len();
4444  RowIdMap.Clr();
4445  for (TInt IdCnt = 0; IdCnt < JointTableSize; IdCnt++) {
4446  RowIdMap.AddDat(IdCnt, IdCnt);
4447  }
4448 
4449  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4450  for (int j = 0; j < JointRowIDSet.Len(); j++) {
4451  const TIntPrV& RowIDs = JointRowIDSet[j];
4452  int start = StartOffsets[j];
4453  int NewRows = RowIDs.Len();
4454  if (NewRows == 0) {continue;}
4455  for (TInt r = 0; r < NewRows; r++){
4456  TIntPr CurrRowIdPr = RowIDs[r];
4457  for(TInt i = 0; i < T1.IntCols.Len(); i++){
4458  IntCols[i][start+r] = T1.IntCols[i][CurrRowIdPr.GetVal1()];
4459  }
4460  for(TInt i = 0; i < T1.FltCols.Len(); i++){
4461  FltCols[i][start+r] = T1.FltCols[i][CurrRowIdPr.GetVal1()];
4462  }
4463  for(TInt i = 0; i < T1.StrColMaps.Len(); i++){
4464  StrColMaps[i][start+r] = T1.StrColMaps[i][CurrRowIdPr.GetVal1()];
4465  }
4466  for(TInt i = 0; i < T2.IntCols.Len(); i++){
4467  IntCols[i+IntOffset][start+r] = T2.IntCols[i][CurrRowIdPr.GetVal2()];
4468  }
4469  for(TInt i = 0; i < T2.FltCols.Len(); i++){
4470  FltCols[i+FltOffset][start+r] = T2.FltCols[i][CurrRowIdPr.GetVal2()];
4471  }
4472  for(TInt i = 0; i < T2.StrColMaps.Len(); i++){
4473  StrColMaps[i+StrOffset][start+r] = T2.StrColMaps[i][CurrRowIdPr.GetVal2()];
4474  }
4475  IntCols[IdOffset][start+r] = start+r;
4476  }
4477  for(TInt r = 0; r < NewRows; r++){
4478  Next[start+r] = start+r+1;
4479  }
4480  }
4481  LastValidRow = JointTableSize-1;
4482  Next[LastValidRow] = Last;
4483  //double endIterate = omp_get_wtime();
4484  //printf("Iterate time = %f\n",endIterate-endResize);
4485 }
4486 #endif // USE_OPENMP
4487 
// Body of a union-all routine; its signature line is not part of this listing.
// Builds a new table whose schema is this table's schema without the id
// column, adds this table's rows with AddTable and the rows of `Table` with
// UnionAllInPlace, and returns the new table.
4489  Schema NewSchema;
4490  for (TInt c = 0; c < Sch.Len(); c++) {
4491  if (Sch[c].Val1 != GetIdColName()) {
4492  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4493  }
4494  }
4495  PTable result = TTable::New(NewSchema, Context);
4496  result->AddTable(*this);
4497  result->UnionAllInPlace(Table);
4498  return result;
4499 }
4500 
4501 void TTable::UnionAllInPlace(const TTable& Table) {
4502  AddTable(Table);
4503  // TODO: For the moment, IDs are not initialized (to avoid having too many ID columns)
4504  //result->InitIds();
4505 }
4506 
4507 
4508 PTable TTable::Union(const TTable& Table) {
4509  Schema NewSchema;
4510  THashSet<TInt> Collisions;
4511  TStrV ColNames;
4512 
4513  for (TInt c = 0; c < Sch.Len(); c++) {
4514  if (Sch[c].Val1 != GetIdColName()) {
4515  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4516  ColNames.Add(Sch[c].Val1);
4517  }
4518  }
4519  PTable result = TTable::New(NewSchema, Context);
4520 
4521  GetCollidingRows(Table, Collisions);
4522 
4523  result->AddTable(*this);
4524 
4525  result->Unique(ColNames);
4526 
4527  // this part should be made faster by adding all the rows in one go
4528  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4529  if (!Collisions.IsKey(it.GetRowIdx())) {
4530  result->AddRow(it);
4531  }
4532  }
4533 
4534  // printf("this: %d %d, table: %d %d, result: %d %d\n",
4535  // this->GetNumRows().Val, this->GetNumValidRows().Val,
4536  // Table.GetNumRows().Val, Table.GetNumValidRows().Val,
4537  // result->GetNumRows().Val, result->GetNumValidRows().Val);
4538 
4539  result->InitIds();
4540  return result;
4541 }
4542 
4543 
// Body of an intersection routine; its signature line is not part of this listing.
// Builds a new table (this table's schema without the id column) containing
// the rows of `Table` that GetCollidingRows reports as present in this
// table, then initialises ids on the result.
4545  Schema NewSchema;
4546  THashSet<TInt> Collisions;
4547 
4548  for (TInt c = 0; c < Sch.Len(); c++) {
4549  if (Sch[c].Val1 != GetIdColName()) {
4550  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4551  }
4552  }
4553  PTable result = TTable::New(NewSchema, Context);
4554 
4555  GetCollidingRows(Table, Collisions);
4556 
4557  // this part should be made faster by adding all the rows in one go
4558  for (TRowIterator it = Table.BegRI(); it < Table.EndRI(); it++) {
4559  if (Collisions.IsKey(it.GetRowIdx())) {
4560  result->AddRow(it);
4561  }
4562  }
4563  result->InitIds();
4564  return result;
4565 }
4566 
4567 // TTable cannot be const because we will eventually call Table->GroupAux
4568 // as of now, GroupAux cannot be const because it modifies the table in some cases
// The signature line is not part of this listing. The body builds a new table
// (this table's schema without the id column) holding the rows of this table
// that Table.GetCollidingRows does NOT report as present in `Table`, then
// initialises ids on the result.
4570  Schema NewSchema;
4571  THashSet<TInt> Collisions;
4572 
4573  for (TInt c = 0; c < Sch.Len(); c++) {
4574  if (Sch[c].Val1 != GetIdColName()) {
4575  NewSchema.Add(TPair<TStr,TAttrType>(Sch[c].Val1, Sch[c].Val2));
4576  }
4577  }
4578  PTable result = TTable::New(NewSchema, Context);
4579 
// note the reversed roles: `Table` is grouped and this table's rows are probed
4580  Table.GetCollidingRows(*this, Collisions);
4581 
4582  // this part should be made faster by adding all the rows in one go
4583  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
4584  if (!Collisions.IsKey(it.GetRowIdx())) {
4585  result->AddRow(it);
4586  }
4587  }
4588  result->InitIds();
4589  return result;
4590 }
4591 
4592 PTable TTable::Project(const TStrV& ProjectCols) {
4593  Schema NewSchema;
4594  for (TInt c = 0; c < ProjectCols.Len(); c++) {
4595  if (!IsColName(ProjectCols[c])) { TExcept::Throw("no such column " + ProjectCols[c]); }
4596  NewSchema.Add(TPair<TStr,TAttrType>(ProjectCols[c], GetColType(ProjectCols[c])));
4597  }
4598 
4599  PTable result = TTable::New(NewSchema, Context);
4600  result->AddTable(*this);
4601  result->InitIds();
4602  return result;
4603 }
4604 
4605 TBool TTable::IsAttr(const TStr& Attr) {
4606  return IsColName(Attr);
4607 }
4608 
4609 TStr TTable::RenumberColName(const TStr& ColName) const {
4610  TStr NColName = ColName;
4611  if (NColName.GetCh(NColName.Len()-2) == '-') {
4612  NColName = NColName.GetSubStr(0,NColName.Len()-3);
4613  }
4614  TInt Conflicts = 0;
4615  for (TInt i = 0; i < Sch.Len(); i++) {
4616  if (NColName == Sch[i].Val1.GetSubStr(0, Sch[i].Val1.Len()-3)) {
4617  Conflicts++;
4618  }
4619  }
4620  Conflicts++;
4621  NColName = NColName + "-" + Conflicts.GetStr();
4622  return NColName;
4623 }
4624 
4625 TStr TTable::DenormalizeColName(const TStr& ColName) const {
4626  TStr DColName = ColName;
4627  if (DColName.Len() == 0) { return DColName; }
4628  if (DColName.GetCh(0) == '_') { return DColName; }
4629  if (DColName.GetCh(DColName.Len()-2) == '-') {
4630  DColName = DColName.GetSubStr(0,DColName.Len()-3);
4631  }
4632  TInt Conflicts = 0;
4633  for (TInt i = 0; i < Sch.Len(); i++) {
4634  if (DColName == Sch[i].Val1.GetSubStr(0, Sch[i].Val1.Len()-3)) {
4635  Conflicts++;
4636  }
4637  }
4638  if (Conflicts > 1) { return ColName; }
4639  else { return DColName; }
4640 }
4641 
// Body of a schema accessor; its signature line is not part of this listing.
// Returns a copy of the schema in which every column name has been passed
// through DenormalizeColName; column types are copied unchanged.
4643  Schema DSch;
4644  for (TInt i = 0; i < Sch.Len(); i++) {
4645  DSch.Add(TPair<TStr, TAttrType>(DenormalizeColName(Sch[i].Val1), Sch[i].Val2));
4646  }
4647  return DSch;
4648 }
4649 
// Registers a new integer column ColName in the schema and in the column
// type map, using IntCols.Len()-1 as its column index.
// NOTE(review): the listing skips source line 4652; the statement that
// appends the column vector to IntCols is not visible here.
4650 void TTable::AddIntCol(const TStr& ColName) {
4651  AddSchemaCol(ColName, atInt);
4653  TInt L = IntCols.Len();
4654  AddColType(ColName, atInt, L-1);
4655 }
4656 
4657 void TTable::AddFltCol(const TStr& ColName) {
4658  AddSchemaCol(ColName, atFlt);
4659  FltCols.Add(TFltV(NumRows));
4660  TInt L = FltCols.Len();
4661  AddColType(ColName, atFlt, L-1);
4662 }
4663 
// Registers a new string column ColName in the schema and in the column type
// map, using StrColMaps.Len()-1 as its column index.
// NOTE(review): the listing skips source line 4666; the statement that
// appends the column vector to StrColMaps is not visible here.
4664 void TTable::AddStrCol(const TStr& ColName) {
4665  AddSchemaCol(ColName, atStr);
4667  TInt L = StrColMaps.Len();
4668  AddColType(ColName, atStr, L-1);
4669 }
4670 
// Adds an integer label column LabelName in which every row index below
// NumRows is set to NegativeLabel and every row listed in SelectedRows is
// then set to PositiveLabel.
// NOTE(review): LabelColIdx is taken as IntCols.Len() and used as an index
// below, which requires a column to be appended in between; the listing skips
// source line 4675, so that statement is not visible here.
4671 void TTable::ClassifyAux(const TIntV& SelectedRows, const TStr& LabelName, const TInt& PositiveLabel, const TInt& NegativeLabel) {
4672  AddSchemaCol(LabelName, atInt);
4673  TInt LabelColIdx = IntCols.Len();
4674  AddColType(LabelName, atInt, LabelColIdx);
4676  for (TInt i = 0; i < NumRows; i++) {
4677  IntCols[LabelColIdx][i] = NegativeLabel;
4678  }
4679  for (TInt i = 0; i < SelectedRows.Len(); i++) {
4680  IntCols[LabelColIdx][SelectedRows[i]] = PositiveLabel;
4681  }
4682 }
4683 
4684 #ifdef USE_OPENMP
4685 void TTable::ColGenericOpMP(TInt ArgColIdx1, TInt ArgColIdx2, TAttrType ArgType1, TAttrType ArgType2, TInt ResColIdx, TArithOp op){
4686  TAttrType ResType = atFlt;
4687  if(ArgType1 == atInt && ArgType2 == atInt){ ResType = atInt;}
4688  TIntPrV Partitions;
4689  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);
4690  TInt PartitionSize = Partitions[0].GetVal2()-Partitions[0].GetVal1()+1;
4691  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
4692  for (int i = 0; i < Partitions.Len(); i++){
4693  TRowIterator RowI(Partitions[i].GetVal1(), this);
4694  TRowIterator EndI(Partitions[i].GetVal2(), this);
4695  while(RowI < EndI){
4696  if(ResType == atInt){
4697  TInt V1 = RowI.GetIntAttr(ArgColIdx1);
4698  TInt V2 = RowI.GetIntAttr(ArgColIdx2);
4699  if (op == aoAdd) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 + V2; }
4700  if (op == aoSub) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 - V2; }
4701  if (op == aoMul) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 * V2; }
4702  if (op == aoDiv) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 / V2; }
4703  if (op == aoMod) { IntCols[ResColIdx][RowI.GetRowIdx()] = V1 % V2; }
4704  if (op == aoMin) { IntCols[ResColIdx][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4705  if (op == aoMax) { IntCols[ResColIdx][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4706  } else{
4707  TFlt V1 = (ArgType1 == atInt) ? (TFlt)RowI.GetIntAttr(ArgColIdx1) : RowI.GetFltAttr(ArgColIdx1);
4708  TFlt V2 = (ArgType2 == atInt) ? (TFlt)RowI.GetIntAttr(ArgColIdx2) : RowI.GetFltAttr(ArgColIdx2);
4709  if (op == aoAdd) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 + V2; }
4710  if (op == aoSub) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 - V2; }
4711  if (op == aoMul) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 * V2; }
4712  if (op == aoDiv) { FltCols[ResColIdx][RowI.GetRowIdx()] = V1 / V2; }
4713  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4714  if (op == aoMin) { FltCols[ResColIdx][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4715  if (op == aoMax) { FltCols[ResColIdx][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4716  }
4717  RowI++;
4718  }
4719  }
4720 }
4721 #endif // USE_OPENMP
4722 
4723 /* Performs generic operations on two numeric attributes
4724  * Operation can be +, -, *, /, %, min or max
4725  * Alternative is to write separate functions for each operation
4726  * Branch prediction may result in as fast performance anyway ?
4727  *
4728  */
4729 void TTable::ColGenericOp(const TStr& Attr1, const TStr& Attr2, const TStr& ResAttr, TArithOp op) {
4730  // check if attributes are valid
4731  if (!IsAttr(Attr1)) TExcept::Throw("No attribute present: " + Attr1);
4732  if (!IsAttr(Attr2)) TExcept::Throw("No attribute present: " + Attr2);
4733  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4734  TPair<TAttrType, TInt> Info2 = GetColTypeMap(Attr2);
4735  TAttrType Arg1Type = Info1.Val1;
4736  TAttrType Arg2Type = Info2.Val1;
4737  if (Arg1Type == atStr || Arg2Type == atStr) {
4738  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4739  }
4740  if(Arg1Type == atInt && Arg2Type == atFlt && ResAttr == ""){
4741  TExcept::Throw("Trying to write float values to an existing int-typed column");
4742  }
4743  // source column indices
4744  TInt ColIdx1 = Info1.Val2;
4745  TInt ColIdx2 = Info2.Val2;
4746 
4747  // destination column index
4748  TInt ColIdx3 = ColIdx1;
4749  // Create empty result column with type that of first attribute
4750  if (ResAttr != "") {
4751  if (Arg1Type == atInt && Arg2Type == atInt) {
4752  AddIntCol(ResAttr);
4753  }
4754  else {
4755  AddFltCol(ResAttr);
4756  }
4757  ColIdx3 = GetColIdx(ResAttr);
4758  }
4759 #ifdef USE_OPENMP
4760  if(GetMP()){
4761  ColGenericOpMP(ColIdx1, ColIdx2, Arg1Type, Arg2Type, ColIdx3, op);
4762  return;
4763  }
4764 #endif //USE_OPENMP
4765  TAttrType ResType = atFlt;
4766  if(Arg1Type == atInt && Arg2Type == atInt){ printf("hooray!\n"); ResType = atInt;}
4767  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
4768  //printf("%d %d %d %d\n", ColIdx1.Val, ColIdx2.Val, ColIdx3.Val, RowI.GetRowIdx().Val);
4769  if(ResType == atInt){
4770  TInt V1 = RowI.GetIntAttr(ColIdx1);
4771  TInt V2 = RowI.GetIntAttr(ColIdx2);
4772  if (op == aoAdd) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 + V2; }
4773  if (op == aoSub) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 - V2; }
4774  if (op == aoMul) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 * V2; }
4775  if (op == aoDiv) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 / V2; }
4776  if (op == aoMod) { IntCols[ColIdx3][RowI.GetRowIdx()] = V1 % V2; }
4777  if (op == aoMin) { IntCols[ColIdx3][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4778  if (op == aoMax) { IntCols[ColIdx3][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4779  } else{
4780  TFlt V1 = (Arg1Type == atInt) ? (TFlt)RowI.GetIntAttr(ColIdx1) : RowI.GetFltAttr(ColIdx1);
4781  TFlt V2 = (Arg2Type == atInt) ? (TFlt)RowI.GetIntAttr(ColIdx2) : RowI.GetFltAttr(ColIdx2);
4782  if (op == aoAdd) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 + V2; }
4783  if (op == aoSub) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 - V2; }
4784  if (op == aoMul) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 * V2; }
4785  if (op == aoDiv) { FltCols[ColIdx3][RowI.GetRowIdx()] = V1 / V2; }
4786  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4787  if (op == aoMin) { FltCols[ColIdx3][RowI.GetRowIdx()] = (V1 < V2) ? V1 : V2;}
4788  if (op == aoMax) { FltCols[ColIdx3][RowI.GetRowIdx()] = (V1 > V2) ? V1 : V2;}
4789  }
4790  }
4791 }
4792 
4793 void TTable::ColAdd(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4794  ColGenericOp(Attr1, Attr2, ResultAttrName, aoAdd);
4795 }
4796 
4797 void TTable::ColSub(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4798  ColGenericOp(Attr1, Attr2, ResultAttrName, aoSub);
4799 }
4800 
4801 void TTable::ColMul(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4802  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMul);
4803 }
4804 
4805 void TTable::ColDiv(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4806  ColGenericOp(Attr1, Attr2, ResultAttrName, aoDiv);
4807 }
4808 
4809 void TTable::ColMod(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4810  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMod);
4811 }
4812 
4813 void TTable::ColMin(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4814  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMin);
4815 }
4816 
4817 void TTable::ColMax(const TStr& Attr1, const TStr& Attr2, const TStr& ResultAttrName) {
4818  ColGenericOp(Attr1, Attr2, ResultAttrName, aoMax);
4819 }
4820 
4821 void TTable::ColGenericOp(const TStr& Attr1, TTable& Table, const TStr& Attr2, const TStr& ResAttr,
4822  TArithOp op, TBool AddToFirstTable) {
4823  // check if attributes are valid
4824  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
4825  if (!Table.IsAttr(Attr2)) { TExcept::Throw("No attribute present: " + Attr2); }
4826 
4827  if (NumValidRows != Table.NumValidRows) {
4828  TExcept::Throw("Tables do not have equal number of rows");
4829  }
4830 
4831  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4832  TPair<TAttrType, TInt> Info2 = Table.GetColTypeMap(Attr2);
4833  TAttrType Arg1Type = Info1.Val1;
4834  TAttrType Arg2Type = Info2.Val1;
4835  if (Info1.Val1 == atStr || Info2.Val1 == atStr) {
4836  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4837  }
4838  if(Arg1Type == atInt && Arg2Type == atFlt && ResAttr == ""){
4839  TExcept::Throw("Trying to write float values to an existing int-typed column");
4840  }
4841  // source column indices
4842  TInt ColIdx1 = Info1.Val2;
4843  TInt ColIdx2 = Info2.Val2;
4844 
4845  // destination column index
4846  TInt ColIdx3 = AddToFirstTable ? ColIdx1 : ColIdx2;
4847 
4848  // Create empty result column in appropriate table with type that of first attribute
4849  if (ResAttr != "") {
4850  if (AddToFirstTable) {
4851  if (Arg1Type == atInt && Arg2Type == atInt) {
4852  AddIntCol(ResAttr);
4853  } else {
4854  AddFltCol(ResAttr);
4855  }
4856  ColIdx3 = GetColIdx(ResAttr);
4857  }
4858  else {
4859  if (Arg1Type == atInt && Arg2Type == atInt) {
4860  Table.AddIntCol(ResAttr);
4861  } else {
4862  Table.AddFltCol(ResAttr);
4863  }
4864  ColIdx3 = Table.GetColIdx(ResAttr);
4865  }
4866  }
4867 
4868  /*
4869  #ifdef USE_OPENMP
4870  if(GetMP()){
4871  ColGenericOpMP(Table, AddToFirstTable, ColIdx1, ColIdx2, Arg1Type, Arg2Type, ColIdx3, op);
4872  return;
4873  }
4874  #endif //USE_OPENMP
4875  */
4876 
4877  TRowIterator RI1, RI2;
4878  RI1 = BegRI();
4879  RI2 = Table.BegRI();
4880  TAttrType ResType = atFlt;
4881  if(Arg1Type == atInt && Arg2Type == atInt){ ResType = atInt;}
4882  while (RI1 < EndRI() && RI2 < Table.EndRI()) {
4883  if (ResType == atInt) {
4884  TInt V1 = RI1.GetIntAttr(ColIdx1);
4885  TInt V2 = RI2.GetIntAttr(ColIdx2);
4886  if (AddToFirstTable) {
4887  if (op == aoAdd) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 + V2; }
4888  if (op == aoSub) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 - V2; }
4889  if (op == aoMul) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 * V2; }
4890  if (op == aoDiv) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 / V2; }
4891  if (op == aoMod) { IntCols[ColIdx3][RI1.GetRowIdx()] = V1 % V2; }
4892  }
4893  else {
4894  if (op == aoAdd) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 + V2; }
4895  if (op == aoSub) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 - V2; }
4896  if (op == aoMul) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 * V2; }
4897  if (op == aoDiv) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 / V2; }
4898  if (op == aoMod) { Table.IntCols[ColIdx3][RI2.GetRowIdx()] = V1 % V2; }
4899  }
4900  } else {
4901  TFlt V1 = (Arg1Type == atInt) ? (TFlt)RI1.GetIntAttr(ColIdx1) : RI2.GetFltAttr(ColIdx1);
4902  TFlt V2 = (Arg2Type == atInt) ? (TFlt)RI1.GetIntAttr(ColIdx2) : RI2.GetFltAttr(ColIdx2);
4903  if (AddToFirstTable) {
4904  if (op == aoAdd) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 + V2; }
4905  if (op == aoSub) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 - V2; }
4906  if (op == aoMul) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 * V2; }
4907  if (op == aoDiv) { FltCols[ColIdx3][RI1.GetRowIdx()] = V1 / V2; }
4908  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4909  } else {
4910  if (op == aoAdd) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 + V2; }
4911  if (op == aoSub) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 - V2; }
4912  if (op == aoMul) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 * V2; }
4913  if (op == aoDiv) { Table.FltCols[ColIdx3][RI2.GetRowIdx()] = V1 / V2; }
4914  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
4915  }
4916  }
4917  RI1++;
4918  RI2++;
4919  }
4920 
4921  if (RI1 != EndRI() || RI2 != Table.EndRI()) {
4922  TExcept::Throw("ColGenericOp: Iteration error");
4923  }
4924 }
4925 
4926 void TTable::ColAdd(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4927  const TStr& ResultAttrName, TBool AddToFirstTable) {
4928  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoAdd, AddToFirstTable);
4929 }
4930 
4931 void TTable::ColSub(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4932  const TStr& ResultAttrName, TBool AddToFirstTable) {
4933  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoSub, AddToFirstTable);
4934 }
4935 
4936 void TTable::ColMul(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4937  const TStr& ResultAttrName, TBool AddToFirstTable) {
4938  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoMul, AddToFirstTable);
4939 }
4940 
4941 void TTable::ColDiv(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4942  const TStr& ResultAttrName, TBool AddToFirstTable) {
4943  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoDiv, AddToFirstTable);
4944 }
4945 
4946 void TTable::ColMod(const TStr& Attr1, TTable& Table, const TStr& Attr2,
4947  const TStr& ResultAttrName, TBool AddToFirstTable) {
4948  ColGenericOp(Attr1, Table, Attr2, ResultAttrName, aoMod, AddToFirstTable);
4949 }
4950 
4951 
4952 void TTable::ColGenericOp(const TStr& Attr1, const TFlt& Num, const TStr& ResAttr, TArithOp op, const TBool floatCast) {
4953  // check if attribute is valid
4954  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
4955 
4956  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
4957  TAttrType ArgType = Info1.Val1;
4958  if (ArgType == atStr) {
4959  TExcept::Throw("Only numeric columns supported in arithmetic operations.");
4960  }
4961  // source column index
4962  TInt ColIdx1 = Info1.Val2;
4963  // destination column index
4964  TInt ColIdx2 = ColIdx1;
4965 
4966  // Create empty result column with type that of first attribute
4967  TBool shouldCast = floatCast;
4968  if (ResAttr != "") {
4969  if ((ArgType == atInt) & !shouldCast) {
4970  AddIntCol(ResAttr);
4971  } else {
4972  AddFltCol(ResAttr);
4973  }
4974  ColIdx2 = GetColIdx(ResAttr);
4975  } else {
4976  // Cannot change type of existing attribute
4977  shouldCast = false;
4978  }
4979 
4980  #ifdef USE_OPENMP
4981  if(GetMP()){
4982  ColGenericOpMP(ColIdx1, ColIdx2, ArgType, Num, op, shouldCast);
4983  return;
4984  }
4985  #endif //USE_OPENMP
4986 
4987  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
4988  if ((ArgType == atInt) && !shouldCast) {
4989  TInt CurVal = RowI.GetIntAttr(ColIdx1);
4990  TInt Val = static_cast<int>(Num);
4991  if (op == aoAdd) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Val; }
4992  if (op == aoSub) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Val; }
4993  if (op == aoMul) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Val; }
4994  if (op == aoDiv) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Val; }
4995  if (op == aoMod) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal % Val; }
4996  }
4997  else {
4998  TFlt CurVal = (ArgType == atFlt) ? RowI.GetFltAttr(ColIdx1) : (TFlt) RowI.GetIntAttr(ColIdx1);
4999  if (op == aoAdd) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Num; }
5000  if (op == aoSub) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Num; }
5001  if (op == aoMul) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Num; }
5002  if (op == aoDiv) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Num; }
5003  if (op == aoMod) { TExcept::Throw("Cannot find modulo for float columns"); }
5004  }
5005  }
5006 }
5007 
#ifdef USE_OPENMP
// OpenMP variant of the column-with-constant arithmetic operation.
// Reads column ColIdx1 (of type ArgType), combines each value with Num using op and
// writes the result to column ColIdx2. Integer arithmetic (with Num truncated to int) is
// used when ArgType is atInt and ShouldCast is false; float arithmetic otherwise.
// The table is split with GetPartitionRanges and the partitions are processed in parallel.
// Throws for mod on the float path.
void TTable::ColGenericOpMP(const TInt& ColIdx1, const TInt& ColIdx2, TAttrType ArgType, const TFlt& Num, TArithOp op, TBool ShouldCast){
  const bool IntArith = (ArgType == atInt) && !ShouldCast;

  TIntPrV Partitions;
  GetPartitionRanges(Partitions, omp_get_max_threads()*CHUNKS_PER_THREAD);

  // Float modulo is unsupported. Report it before entering the parallel loop: an
  // exception thrown inside the loop would escape the OpenMP region instead of
  // reaching the caller.
  if (!IntArith && op == aoMod && Partitions.Len() > 0) {
    TExcept::Throw("Cannot find modulo for float columns");
  }

  #pragma omp parallel for schedule(dynamic, CHUNKS_PER_THREAD)
  for (int i = 0; i < Partitions.Len(); i++){
    TRowIterator RowI(Partitions[i].GetVal1(), this);
    TRowIterator EndI(Partitions[i].GetVal2(), this);
    while(RowI < EndI){
      if (IntArith) {
        TInt CurVal = RowI.GetIntAttr(ColIdx1);
        TInt Val = static_cast<int>(Num);
        if (op == aoAdd) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Val; }
        if (op == aoSub) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Val; }
        if (op == aoMul) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Val; }
        if (op == aoDiv) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Val; }
        if (op == aoMod) { IntCols[ColIdx2][RowI.GetRowIdx()] = CurVal % Val; }
      } else {
        // an integer source value is widened to float first
        TFlt CurVal = (ArgType == atFlt) ? RowI.GetFltAttr(ColIdx1) : (TFlt) RowI.GetIntAttr(ColIdx1);
        if (op == aoAdd) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal + Num; }
        if (op == aoSub) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal - Num; }
        if (op == aoMul) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal * Num; }
        if (op == aoDiv) { FltCols[ColIdx2][RowI.GetRowIdx()] = CurVal / Num; }
      }
      RowI++;
    }
  }
}
#endif
5039 
5040 void TTable::ColAdd(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5041  ColGenericOp(Attr1, Num, ResultAttrName, aoAdd, floatCast);
5042 }
5043 
5044 void TTable::ColSub(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5045  ColGenericOp(Attr1, Num, ResultAttrName, aoSub, floatCast);
5046 }
5047 
5048 void TTable::ColMul(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5049  ColGenericOp(Attr1, Num, ResultAttrName, aoMul, floatCast);
5050 }
5051 
5052 void TTable::ColDiv(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5053  ColGenericOp(Attr1, Num, ResultAttrName, aoDiv, floatCast);
5054 }
5055 
5056 void TTable::ColMod(const TStr& Attr1, const TFlt& Num, const TStr& ResultAttrName, const TBool floatCast) {
5057  ColGenericOp(Attr1, Num, ResultAttrName, aoMod, floatCast);
5058 }
5059 
5060 void TTable::ColConcat(const TStr& Attr1, const TStr& Attr2, const TStr& Sep, const TStr& ResAttr) {
5061  // check if attributes are valid
5062  if (!IsAttr(Attr1)) TExcept::Throw("No attribute present: " + Attr1);
5063  if (!IsAttr(Attr2)) TExcept::Throw("No attribute present: " + Attr2);
5064 
5065  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5066  TPair<TAttrType, TInt> Info2 = GetColTypeMap(Attr2);
5067 
5068  if (Info1.Val1 != atStr || Info2.Val1 != atStr) {
5069  TExcept::Throw("Only string columns supported in concat.");
5070  }
5071 
5072  // source column indices
5073  TInt ColIdx1 = Info1.Val2;
5074  TInt ColIdx2 = Info2.Val2;
5075 
5076  // destination column index
5077  TInt ColIdx3 = ColIdx1;
5078 
5079  // Create empty result column with type that of first attribute
5080  if (ResAttr != "") {
5081  AddStrCol(ResAttr);
5082  ColIdx3 = GetColIdx(ResAttr);
5083  }
5084 
5085  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5086  TStr CurVal1 = RowI.GetStrAttr(ColIdx1);
5087  TStr CurVal2 = RowI.GetStrAttr(ColIdx2);
5088  TStr NewVal = CurVal1 + Sep + CurVal2;
5089  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5090  StrColMaps[ColIdx3][RowI.GetRowIdx()] = Key;
5091  }
5092 }
5093 
5094 void TTable::ColConcat(const TStr& Attr1, TTable& Table, const TStr& Attr2, const TStr& Sep,
5095  const TStr& ResAttr, TBool AddToFirstTable) {
5096  // check if attributes are valid
5097  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
5098  if (!Table.IsAttr(Attr2)) { TExcept::Throw("No attribute present: " + Attr2); }
5099 
5100  if (NumValidRows != Table.NumValidRows) {
5101  TExcept::Throw("Tables do not have equal number of rows");
5102  }
5103 
5104  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5105  TPair<TAttrType, TInt> Info2 = Table.GetColTypeMap(Attr2);
5106 
5107  if (Info1.Val1 != atStr || Info2.Val1 != atStr) {
5108  TExcept::Throw("Only string columns supported in concat.");
5109  }
5110 
5111  // source column indices
5112  TInt ColIdx1 = Info1.Val2;
5113  TInt ColIdx2 = Info2.Val2;
5114 
5115  // destination column index
5116  TInt ColIdx3 = ColIdx1;
5117 
5118  if (!AddToFirstTable) {
5119  ColIdx3 = ColIdx2;
5120  }
5121 
5122  // Create empty result column in appropriate table with type that of first attribute
5123  if (ResAttr != "") {
5124  if (AddToFirstTable) {
5125  AddStrCol(ResAttr);
5126  ColIdx3 = GetColIdx(ResAttr);
5127  }
5128  else {
5129  Table.AddStrCol(ResAttr);
5130  ColIdx3 = Table.GetColIdx(ResAttr);
5131  }
5132  }
5133 
5134  TRowIterator RI1, RI2;
5135 
5136  RI1 = BegRI();
5137  RI2 = Table.BegRI();
5138 
5139  while (RI1 < EndRI() && RI2 < Table.EndRI()) {
5140  TStr CurVal1 = RI1.GetStrAttr(ColIdx1);
5141  TStr CurVal2 = RI2.GetStrAttr(ColIdx2);
5142  TStr NewVal = CurVal1 + Sep + CurVal2;
5143  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5144  if (AddToFirstTable) {
5145  StrColMaps[ColIdx3][RI1.GetRowIdx()] = Key;
5146  }
5147  else {
5148  Table.StrColMaps[ColIdx3][RI2.GetRowIdx()] = Key;
5149  }
5150  RI1++;
5151  RI2++;
5152  }
5153 
5154  if (RI1 != EndRI() || RI2 != Table.EndRI()) {
5155  TExcept::Throw("ColGenericOp: Iteration error");
5156  }
5157 }
5158 
5159 void TTable::ColConcatConst(const TStr& Attr1, const TStr& Val, const TStr& Sep, const TStr& ResAttr) {
5160  // check if attribute is valid
5161  if (!IsAttr(Attr1)) { TExcept::Throw("No attribute present: " + Attr1); }
5162 
5163  TPair<TAttrType, TInt> Info1 = GetColTypeMap(Attr1);
5164 
5165  if (Info1.Val1 != atStr) {
5166  TExcept::Throw("Only string columns supported in concat.");
5167  }
5168 
5169  // source column index
5170  TInt ColIdx1 = Info1.Val2;
5171 
5172  // destination column index
5173  TInt ColIdx2 = ColIdx1;
5174 
5175  // Create empty result column with type that of first attribute
5176  if (ResAttr != "") {
5177  AddStrCol(ResAttr);
5178  ColIdx2 = GetColIdx(ResAttr);
5179  }
5180 
5181  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5182  TStr CurVal = RowI.GetStrAttr(ColIdx1);
5183  TStr NewVal = CurVal + Sep + Val;
5184  TInt Key = TInt(Context->StringVals.AddKey(NewVal));
5185  StrColMaps[ColIdx2][RowI.GetRowIdx()] = Key;
5186  }
5187 }
5188 
5189 void TTable::ReadIntCol(const TStr& ColName, TIntV& Result) const{
5190  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5191  if (GetColType(ColName) != atInt) { TExcept::Throw("not an integer column " + ColName); }
5192  TInt ColId = GetColIdx(ColName);
5193  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5194  Result.Add(it.GetIntAttr(ColId));
5195  }
5196 }
5197 
5198 void TTable::ReadFltCol(const TStr& ColName, TFltV& Result) const{
5199  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5200  if (GetColType(ColName) != atFlt) { TExcept::Throw("not a floating point column " + ColName); }
5201  TInt ColId = GetColIdx(ColName);
5202  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5203  Result.Add(it.GetFltAttr(ColId));
5204  }
5205 }
5206 
5207 void TTable::ReadStrCol(const TStr& ColName, TStrV& Result) const{
5208  if (!IsColName(ColName)) { TExcept::Throw("no such column " + ColName); }
5209  if (GetColType(ColName) != atStr) { TExcept::Throw("not a string column " + ColName); }
5210  TInt ColId = GetColIdx(ColName);
5211  for (TRowIterator it = BegRI(); it < EndRI(); it++) {
5212  Result.Add(it.GetStrAttr(ColId));
5213  }
5214 }
5215 
5216 void TTable::ProjectInPlace(const TStrV& ProjectCols) {
5217  TStrV NProjectCols = NormalizeColNameV(ProjectCols);
5218  for (TInt c = 0; c < NProjectCols.Len(); c++) {
5219  if (!IsColName(NProjectCols[c])) { TExcept::Throw("no such column " + NProjectCols[c]); }
5220  }
5221  THashSet<TStr> ProjectColsSet = THashSet<TStr>(NProjectCols);
5222  // Delete the column vectors
5223  for (TInt i = Sch.Len() - 1; i >= 0; i--) {
5224  TStr ColName = GetSchemaColName(i);
5225  if (ProjectColsSet.IsKey(ColName) || ColName == IdColName) { continue; }
5226  TAttrType ColType = GetSchemaColType(i);
5227  TInt ColId = GetColIdx(ColName);
5228  switch (ColType) {
5229  case atInt:
5230  IntCols.Del(ColId);
5231  break;
5232  case atFlt:
5233  FltCols.Del(ColId);
5234  break;
5235  case atStr:
5236  StrColMaps.Del(ColId);
5237  break;
5238  }
5239  }
5240 
5241  // Rebuild the ColTypeMap with new indexes of the column vectors
5242  TInt IntColCnt = 0;
5243  TInt FltColCnt = 0;
5244  TInt StrColCnt = 0;
5245  ColTypeMap.Clr();
5246  for (TInt i = 0; i < Sch.Len(); i++) {
5247  TStr ColName = GetSchemaColName(i);
5248  if (!ProjectColsSet.IsKey(ColName) && ColName != IdColName) { continue; }
5249  TAttrType ColType = GetSchemaColType(i);
5250  switch (ColType) {
5251  case atInt:
5252  AddColType(ColName, atInt, IntColCnt);
5253  IntColCnt++;
5254  break;
5255  case atFlt:
5256  AddColType(ColName, atFlt, FltColCnt);
5257  FltColCnt++;
5258  break;
5259  case atStr:
5260  AddColType(ColName, atStr, StrColCnt);
5261  StrColCnt++;
5262  break;
5263  }
5264  }
5265 
5266  // Update schema
5267  for (TInt i = Sch.Len() - 1; i >= 0; i--) {
5268  TStr ColName = GetSchemaColName(i);
5269  if (ProjectColsSet.IsKey(ColName) || ColName == IdColName) { continue; }
5270  Sch.Del(i);
5271  }
5272 }
5273 
5274 TInt TTable::CompareKeyVal(const TInt& K1, const TInt& V1, const TInt& K2, const TInt& V2) {
5275  // if (K1 == K2) {
5276  // if (V1 < V2) { return -1; }
5277  // else if (V1 > V2) { return 1; }
5278  // else return 0;
5279  // }
5280  // if (K1 < K2) { return -1; }
5281  // else { return 1; }
5282 
5283  if (K1 == K2) { return V1 - V2; }
5284  else { return K1 - K2; }
5285 }
5286 
5288  TInt j;
5289  for (j = Start; j < End; j++) {
5290  if (CompareKeyVal(Key[j], Val[j], Key[j+1], Val[j+1]) > 0) {
5291  break;
5292  }
5293  }
5294  if (j >= End) { return 0; }
5295  else { return 1; }
5296 }
5297 
5298 void TTable::ISortKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5299  if (Start < End) {
5300  for (TInt i = Start+1; i <= End; i++) {
5301  TInt K = Key[i];
5302  TInt V = Val[i];
5303  TInt j = i;
5304  while ((Start < j) && (CompareKeyVal(Key[j-1], Val[j-1], K, V) > 0)) {
5305  Key[j] = Key[j-1];
5306  Val[j] = Val[j-1];
5307  j--;
5308  }
5309  Key[j] = K;
5310  Val[j] = V;
5311  }
5312  }
5313 }
5314 
5315 TInt TTable::GetPivotKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5316  TInt L = End - Start + 1;
5317  const TInt Idx1 = Start + TInt::GetRnd(L);
5318  const TInt Idx2 = Start + TInt::GetRnd(L);
5319  const TInt Idx3 = Start + TInt::GetRnd(L);
5320  if (CompareKeyVal(Key[Idx1], Val[Idx1], Key[Idx2], Val[Idx2]) < 0) {
5321  if (CompareKeyVal(Key[Idx2], Val[Idx2], Key[Idx3], Val[Idx3]) < 0) { return Idx2; }
5322  if (CompareKeyVal(Key[Idx1], Val[Idx1], Key[Idx3], Val[Idx3]) < 0) { return Idx3; }
5323  return Idx1;
5324  } else {
5325  if (CompareKeyVal(Key[Idx3], Val[Idx3], Key[Idx2], Val[Idx2]) < 0) { return Idx2; }
5326  if (CompareKeyVal(Key[Idx3], Val[Idx3], Key[Idx1], Val[Idx1]) < 0) { return Idx3; }
5327  return Idx1;
5328  }
5329 }
5330 
5331 
5333  TInt Pivot = GetPivotKeyVal(Key, Val, Start, End);
5334  //printf("Pivot=%d\n", Pivot.Val);
5335  TInt PivotKey = Key[Pivot];
5336  TInt PivotVal = Val[Pivot];
5337  Key.Swap(Pivot, End);
5338  Val.Swap(Pivot, End);
5339  TInt StoreIdx = Start;
5340  for (TInt i = Start; i < End; i++) {
5341  //printf("%d %d %d %d\n", Key[i].Val, Val[i].Val, PivotKey.Val, PivotVal.Val);
5342  if (CompareKeyVal(Key[i], Val[i], PivotKey, PivotVal) <= 0) {
5343  Key.Swap(i, StoreIdx);
5344  Val.Swap(i, StoreIdx);
5345  StoreIdx++;
5346  }
5347  }
5348  //printf("StoreIdx=%d\n", StoreIdx.Val);
5349  // move pivot value to its place
5350  Key.Swap(StoreIdx, End);
5351  Val.Swap(StoreIdx, End);
5352  return StoreIdx;
5353 }
5354 
5355 void TTable::QSortKeyVal(TIntV& Key, TIntV& Val, TInt Start, TInt End) {
5356  //printf("Thread=%d, Start=%d, End=%d\n", omp_get_thread_num(), Start.Val, End.Val);
5357  TInt L = End-Start;
5358  if (L <= 0) { return; }
5359  if (CheckSortedKeyVal(Key, Val, Start, End) == 0) { return; }
5360 
5361  if (L <= 20) { ISortKeyVal(Key, Val, Start, End); }
5362  else {
5363  TInt Pivot = PartitionKeyVal(Key, Val, Start, End);
5364 
5365  if (Pivot > End) { return; }
5366  if (L <= 500000) {
5367  QSortKeyVal(Key, Val, Start, Pivot-1);
5368  QSortKeyVal(Key, Val, Pivot+1, End);
5369  } else {
5370 #ifdef USE_OPENMP
5371 #ifndef GLib_WIN32
5372  #pragma omp task untied shared(Key, Val)
5373 #endif
5374 #endif
5375  { QSortKeyVal(Key, Val, Start, Pivot-1); }
5376 
5377 #ifdef USE_OPENMP
5378 #ifndef GLib_WIN32
5379  #pragma omp task untied shared(Key, Val)
5380 #endif
5381 #endif
5382  { QSortKeyVal(Key, Val, Pivot+1, End); }
5383  }
5384  }
5385 }
5386 
5387 TIntV TTable::GetIntRowIdxByVal(const TStr& ColName, const TInt& Val) const {
5388 
5389  if (IntColIndexes.IsKey(ColName)) {
5390  THash<TInt, TIntV> ColIndex = IntColIndexes.GetDat(ColName);
5391  if (ColIndex.IsKey(Val)) {
5392  return ColIndex.GetDat(Val);
5393  }
5394  else {
5395  TIntV Empty;
5396  return Empty;
5397  }
5398  }
5399  TIntV ToReturn;
5400  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5401  TInt ValAtRow = RowI.GetIntAttr(ColName);
5402  if ( Val == ValAtRow) {
5403  ToReturn.Add(RowI.GetRowIdx());
5404  }
5405  }
5406  return ToReturn;
5407 }
5408 TIntV TTable::GetStrRowIdxByMap(const TStr& ColName, const TInt& Map) const {
5409 
5410  if (StrMapColIndexes.IsKey(ColName)) {
5411  THash<TInt, TIntV> ColIndex = StrMapColIndexes.GetDat(ColName);
5412  if (ColIndex.IsKey(Map)) {
5413  return ColIndex.GetDat(Map);
5414  }
5415  else {
5416  TIntV Empty;
5417  return Empty;
5418  }
5419  }
5420  TIntV ToReturn;
5421  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5422  TInt MapAtRow = RowI.GetStrMapByName(ColName);
5423  if ( Map == MapAtRow) {
5424  ToReturn.Add(RowI.GetRowIdx());
5425  }
5426  }
5427  return ToReturn;
5428 }
5429 
5430 TIntV TTable::GetFltRowIdxByVal(const TStr& ColName, const TFlt& Val) const {
5431 
5432  if (FltColIndexes.IsKey(ColName)) {
5433  THash<TFlt, TIntV> ColIndex = FltColIndexes.GetDat(ColName);
5434  if (ColIndex.IsKey(Val)) {
5435  return ColIndex.GetDat(Val);
5436  }
5437  else {
5438  TIntV Empty;
5439  return Empty;
5440  }
5441  }
5442 
5443  TIntV ToReturn;
5444  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5445  TFlt ValAtRow = RowI.GetFltAttr(ColName);
5446  if ( Val == ValAtRow) {
5447  ToReturn.Add(RowI.GetRowIdx());
5448  }
5449  }
5450  return ToReturn;
5451 }
5452 
5454 
5455  THash<TInt, TIntV> NewIndex;
5456  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5457  TInt ValAtRow = RowI.GetIntAttr(ColName);
5458  TInt RowIdx = RowI.GetRowIdx();
5459  if (NewIndex.IsKey(ValAtRow)) {
5460  TIntV Curr_V = NewIndex.GetDat(ValAtRow);
5461  Curr_V.Add(RowIdx);
5462  }
5463  else {
5464  TIntV New_V;
5465  New_V.Add(RowIdx);
5466  NewIndex.AddDat(ValAtRow, New_V);
5467  }
5468  }
5469  IntColIndexes.AddDat(ColName, NewIndex);
5470  return 0;
5471 }
5473 
5474  THash<TFlt, TIntV> NewIndex;
5475  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5476  TFlt ValAtRow = RowI.GetFltAttr(ColName);
5477  TInt RowIdx = RowI.GetRowIdx();
5478  if (NewIndex.IsKey(ValAtRow)) {
5479  TIntV Curr_V = NewIndex.GetDat(ValAtRow);
5480  Curr_V.Add(RowIdx);
5481  }
5482  else {
5483  TIntV New_V;
5484  New_V.Add(RowIdx);
5485  NewIndex.AddDat(ValAtRow, New_V);
5486  }
5487  }
5488  FltColIndexes.AddDat(ColName, NewIndex);
5489  return 0;
5490 }
5492  THash<TInt, TIntV> NewIndex;
5493  for (TRowIterator RowI = BegRI(); RowI < EndRI(); RowI++) {
5494  TInt MapAtRow = RowI.GetStrMapByName(ColName);
5495  TInt RowIdx = RowI.GetRowIdx();
5496  if (NewIndex.IsKey(MapAtRow)) {
5497  TIntV Curr_V = NewIndex.GetDat(MapAtRow);
5498  Curr_V.Add(RowIdx);
5499  }
5500  else {
5501  TIntV New_V;
5502  New_V.Add(RowIdx);
5503  NewIndex.AddDat(MapAtRow, New_V);
5504  }
5505  }
5506  StrMapColIndexes.AddDat(ColName, NewIndex);
5507  return 0;
5508 }
Definition: table.h:268
TSize GetMemUsedKB()
Returns approximate memory used by table in [KB].
Definition: table.cpp:3918
void ThresholdJoinInputCorrectness(const TStr &KeyCol1, const TStr &JoinCol1, const TTable &Table, const TStr &KeyCol2, const TStr &JoinCol2)
Definition: table.cpp:2458
void AddSchemaCol(const TStr &ColName, TAttrType ColType)
Adds column with name ColName and type ColType to the schema.
Definition: table.h:652
TFlt GetFltAttr(TInt ColIdx) const
Returns value of floating point attribute specified by float column index for current row...
Definition: table.cpp:159
TPair< TInt, TInt > TIntPr
Definition: ds.h:83
TInt RequestIndexInt(const TStr &ColName)
Creates Index for Int Column ColName.
Definition: table.cpp:5453
Definition: table.h:268
TBool IsLastGraphOfSequence()
Checks if the end of the graph sequence is reached.
Definition: table.cpp:3663
TBool IsAttr(const TStr &Attr)
Checks if Attr is an attribute of this table schema.
Definition: table.cpp:4605
void SetFltVal(TStr VarName, TFlt VarVal)
Set flt variable value in the predicate or all the children that use it.
Definition: table.h:100
void Order(const TStrV &OrderBy, TStr OrderColName="", TBool ResetRankByMSC=false, TBool Asc=true)
Orders the rows according to the values in columns of OrderBy (in descending lexicographic order)...
Definition: table.cpp:3220
void FillBucketsByInterval(TStr SplitAttr, TIntPrV SplitIntervals)
Fills RowIdBuckets with sets of row ids.
Definition: table.cpp:3577
bool Next()
Loads next line from the input file.
Definition: ssmp.cpp:17
TIter EndI() const
Returns an iterator referring to the past-the-end element in the vector.
Definition: ds.h:567
void RemoveRow(TInt RowIdx, TInt PrevRowIdx)
Removes row with id RowIdx.
Definition: table.cpp:1115
int Reserved() const
Definition: hash.h:771
Definition: table.h:268
TStrV EdgeAttrV
List of columns (attributes) to serve as edge attributes.
Definition: table.h:601
TStr GetStr() const
Definition: dt.h:1107
THash< GroupStmt, THash< TGroupKey, TIntV > > GroupMapping
Maps grouping statements to their (group-by key –> group id) mapping.
Definition: table.h:591
TInt FirstValidRow
Physical index of first valid row.
Definition: table.h:563
TStr DenormalizeColName(const TStr &ColName) const
Removes suffix to column name if exists.
Definition: table.cpp:4625
int Len() const
Definition: dt.h:487
void GetDatV(TVec< TDat > &DatV) const
Definition: hash.h:450
TInt GetPivot(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc)
Gets pivot element for QSort.
Definition: table.cpp:3090
TInt GetColIdx(const TStr &ColName) const
Gets index of column ColName among columns of the same type in the schema.
Definition: table.h:1004
enum TAttrType_ TAttrType
Types for tables, sparse and dense attributes.
TVec< uint64 > GetStartPosV(uint64 Lb, uint64 Ub) const
Finds start positions of all lines ending somewhere in [Lb, Ub)
Definition: ssmp.cpp:106
void StoreGroupCol(const TStr &GroupColName, const TVec< TPair< TInt, TInt > > &GroupAndRowIds)
Parallel helper function for grouping. - we currently don't support such parallel grouping by complex...
Definition: table.cpp:1290
static const TInt Last
Special value for Next vector entry - last row in table.
Definition: table.h:497
PTable UnionAll(const TTable &Table)
Returns union of this table with given Table, preserving duplicates.
Definition: table.cpp:4488
::TSize GetMemUsed() const
Definition: hash.h:794
static TInt PartitionKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5332
Primitive class: Wrapper around primitive data types.
Definition: table.h:220
bool operator==(const TRowIterator &RowI) const
Checks if this iterator points to the same row pointed by RowI.
Definition: table.cpp:147
TStrV GetSrcNodeIntAttrV() const
Gets src node int attribute name vector.
Definition: table.cpp:985
void PrintGrouping(const THash< TGroupKey, TIntV > &Grouping) const
Definition: table.cpp:1768
Schema Sch
Table Schema.
Definition: table.h:559
void SelectFirstNRows(const TInt &N)
Selects first N rows from the table.
Definition: table.cpp:3337
TStrV GetDstNodeStrAttrV() const
Gets dst node str attribute name vector.
Definition: table.cpp:1062
Definition: ds.h:129
void Del(const TSizeTy &ValN)
Removes the element at position ValN.
Definition: ds.h:1130
void GetPartitionRanges(TIntPrV &Partitions, TInt NumPartitions) const
Partitions the table into NumPartitions and populate Partitions with the ranges.
Definition: table.cpp:1157
TInt GetIntAttr(TInt ColIdx) const
Returns value of integer attribute specified by integer column index for current row.
Definition: table.cpp:155
TPredComp
Comparison operators for selection predicates.
Definition: table.h:7
int Val
Definition: dt.h:1046
void Defrag()
Releases memory of deleted rows, and defrags.
Definition: table.cpp:3291
PNEANet ToVarGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals)
Creates the graph sequence one at a time.
Definition: table.cpp:3649
void SaveBin(const TStr &OutFNm)
Saves table schema and content to a binary file.
Definition: table.cpp:829
TStr GetStrAttr(TInt ColIdx) const
Returns value of string attribute specified by string column index for current row.
Definition: table.cpp:163
void Save(TSOut &SOut) const
Definition: dt.h:1060
void AddIntCol(const TStr &ColName)
Adds an integer column with name ColName.
Definition: table.cpp:4650
THash< TStr, TPair< TAttrType, TInt > > ColTypeMap
Definition: table.h:574
TStr Rvar
Right variable of the comparison op.
Definition: table.h:21
static const int Mx
Definition: dt.h:1049
Definition: table.h:266
void ThresholdJoinCountCollisions(const TTable &TB, const TTable &TS, const TIntIntVH &T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS, THash< TIntPr, TIntTr > &Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType)
Definition: table.cpp:2486
void AddGraphAttributeV(TStrV &Attrs, TBool IsEdge, TBool IsSrc, TBool IsDst)
Adds vector of names of columns to be used as graph attributes.
Definition: table.cpp:972
void GroupByIntColMP(const TStr &GroupBy, THashMP< TInt, TIntV > &Grouping, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with integer values, using OpenMP multi-threading.
Definition: table.cpp:1205
void SetFltColToConstMP(TInt UpdateColIdx, TFlt DefaultFltVal)
Definition: table.cpp:4129
int GetFlds() const
Returns the number of fields in the current line.
Definition: ssmp.h:51
const TVal1 & GetVal1() const
Definition: ds.h:60
void ThresholdJoinCountPerJoinKeyCollisions(const TTable &TB, const TTable &TS, const TIntIntVH &T, TInt JoinColIdxB, TInt KeyColIdxB, TInt KeyColIdxS, THash< TIntTr, TIntTr > &Counters, TBool ThisIsSmaller, TAttrType JoinColType, TAttrType KeyType)
Definition: table.cpp:2537
uint64 GetStreamPos() const
Returns position of stream pointer.
Definition: ssmp.h:89
TIter BegI() const
Definition: hash.h:171
void ColAdd(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise addition. See TTable::ColGenericOp.
Definition: table.cpp:4793
TArithOp
Possible column-wise arithmetic operations.
Definition: table.h:268
TInt RequestIndexStrMap(const TStr &ColName)
Creates Index for Str Column ColName.
Definition: table.cpp:5491
double Val
Definition: dt.h:1295
Definition: fl.h:319
TFlt GetNextFltAttr(TInt ColIdx) const
Returns value of float attribute specified by float column index for next row.
Definition: table.cpp:252
TSizeTy Len() const
Returns the number of elements in the vector.
Definition: ds.h:547
Definition: table.h:268
void AddSelectedRows(const TTable &Table, const TIntV &RowIDs)
Adds rows from Table that correspond to ids in RowIDs.
Definition: table.cpp:4376
int Len() const
Definition: hash.h:770
TStr IdColName
A mapping from column name to column type and column index among columns of the same type...
Definition: table.h:575
Predicate - encapsulates comparison operations.
Definition: table.h:82
TBool CompareAtomicConstTStr(TInt ColIdx, const TStr &Val, TPredComp Cmp)
Compares value in column ColIdx with given TStr Val.
Definition: table.cpp:208
PTable SelfSimJoinPerGroup(const TStr &GroupAttr, const TStr &SimCol, const TStr &DistanceColName, const TSimType &SimType, const TFlt &Threshold)
Performs join if the distance between two rows is less than the specified threshold.
Definition: table.cpp:2074
static TStrV NormalizeColNameV(const TStrV &Cols)
Adds suffix to column name if it doesn't exist.
Definition: table.h:549
static TInt CompareKeyVal(const TInt &K1, const TInt &V1, const TInt &K2, const TInt &V2)
Definition: table.cpp:5274
THash< TStr, THash< TInt, TIntV > > StrMapColIndexes
Indexes for String Columns.
Definition: table.h:579
THash< TStr, THash< TInt, TIntV > > IntColIndexes
Indexes for Int Columns.
Definition: table.h:578
void ColConcat(const TStr &Attr1, const TStr &Attr2, const TStr &Sep="", const TStr &ResAttr="")
Concatenates two string columns.
Definition: table.cpp:5060
void Save(TSOut &SOut) const
Definition: hash.h:141
TStrV GetSrcNodeStrAttrV() const
Gets src node str attribute name vector.
Definition: table.cpp:1051
TTableContext * Context
Execution Context.
Definition: table.h:555
TSimType
Distance metrics for similarity joins.
Definition: table.h:149
TBool Start
A flag indicating whether the current row is the first valid row of the table.
Definition: table.h:386
void QSort(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs QSort on given vector V.
Definition: table.cpp:3134
TAttrType Type
Type of the predicate variables.
Definition: table.h:17
TPredicateNode * Left
Left child of this node.
Definition: table.h:57
THash< TStr, TInt > IntVars
Int variables in the current predicate tree.
Definition: table.h:84
Definition: ss.h:72
void InvalidateAffectedGroupings(const TStr &Attr)
Definition: table.cpp:1561
void Dump(FILE *OutF=stdout) const
Prints table contents to a text file.
Definition: table.cpp:867
TInt LastValidRow
Physical index of last valid row.
Definition: table.h:564
void Group(const TStrV &GroupBy, const TStr &GroupColName, TBool Ordered=true, TBool UsePhysicalIds=true)
Groups rows depending on values of GroupBy columns.
Definition: table.cpp:1549
TStr GetSubStr(const int &BChN, const int &EChN) const
Definition: dt.cpp:811
void ResizeTable(int RowCount)
Resizes the table to hold RowCount rows.
Definition: table.cpp:4307
void PrintContextSize()
Definition: table.cpp:3937
static TInt GetMP()
Definition: table.h:537
TAttrAggr
Possible policies for aggregating node attributes.
Definition: table.h:266
void ColDiv(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise division. See TTable::ColGenericOp.
Definition: table.cpp:4805
void Rename(const TStr &Column, const TStr &NewLabel)
Renames a column.
Definition: table.cpp:1085
void GroupAux(const TStrV &GroupBy, THash< TGroupKey, TPair< TInt, TIntV > > &Grouping, TBool Ordered, const TStr &GroupColName, TBool KeepUnique, TIntV &UniqueVec, TBool UsePhysicalIds=true)
Helper function for grouping.
Definition: table.cpp:1302
const TVal2 & GetVal2() const
Definition: ds.h:61
TVal1 Val1
Definition: ds.h:131
TStrV GetEdgeFltAttrV() const
Gets edge float attribute name vector.
Definition: table.cpp:1040
Definition: table.h:149
bool GetInt(const int &FldN, int &Val) const
If the field FldN is an integer its value is returned in Val and the function returns true...
Definition: ss.cpp:447
TStr GetNextStrAttr(TInt ColIdx) const
Returns value of string attribute specified by string column index for next row.
Definition: table.cpp:256
Execution context.
Definition: table.h:194
const TDat & GetDat(const TKey &Key) const
Definition: hash.h:220
Node iterator. Only forward iteration (operator++) is supported.
Definition: network.h:1632
void GetStrAttrNames(TStrV &Names) const
Gets vector of str attribute names.
Definition: network.h:1740
TIter EndI() const
Definition: hash.h:176
void Clr()
Definition: bd.h:502
Schema GetSchema()
Gets the schema of this table.
Definition: table.h:1116
TVec< TIntV > RowIdBuckets
Partitioning of row ids into buckets corresponding to different graph objects when generating a seque...
Definition: table.h:609
TRowIteratorWithRemove BegRIWR()
Gets iterator with remove to the first valid row.
Definition: table.h:1236
TInt GetNumValidRows() const
Gets number of valid, i.e. not deleted, rows in this table.
Definition: table.h:1225
TRowIterator BegRI() const
Gets iterator to the first valid row of the table.
Definition: table.h:1232
int GetFlds() const
Returns the number of fields in the current line.
Definition: ss.h:116
PNEANet ToGraphPerGroupIterator(TStr GroupAttr, TAttrAggr AggrPolicy)
Creates the graph sequence one at a time.
Definition: table.cpp:3654
TVec< TIntV > IntCols
Data columns of integer attributes.
Definition: table.h:568
Iterator class for TTable rows, that allows logical row removal while iterating.
Definition: table.h:383
TSizeTy GetMemUsed() const
Returns the memory footprint (the number of bytes) of the vector.
Definition: ds.h:483
void CheckAndAddIntNode(PNEANet Graph, THashSet< TInt > &NodeVals, TInt NodeId)
Checks if given NodeId was seen earlier; if not, adds it to Graph and hashmap NodeVals.
Definition: table.cpp:3368
TVec< PNEANet > ToGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal=TInt::Mn, TInt EndVal=TInt::Mx)
Creates a sequence of graphs based on values of column SplitAttr and windows specified by JumpSize an...
Definition: table.cpp:3629
void GroupByFltCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with float values. Returns hash table with grouping.
Definition: table.h:1633
TInt GetStrMapByName(const TStr &Col) const
Returns integer mapping of string attribute specified by attribute name for current row...
Definition: table.cpp:181
PTable Minus(TTable &Table)
Returns table with rows that are present in this table but not in given Table.
Definition: table.cpp:4569
bool IsKey(const TKey &Key) const
Definition: shash.h:1148
static PTable GetNodeTable(const PNEANet &Network, TTableContext *Context)
Extracts node TTable from PNEANet.
Definition: table.cpp:3667
THash< TStr, TStr > StrVars
String variables in the current predicate tree.
Definition: table.h:86
TIntV GetStrRowIdxByMap(const TStr &ColName, const TInt &Map) const
Gets the rows containing int mapping Map in str column ColName.
Definition: table.cpp:5408
int GetId() const
Returns edge ID.
Definition: network.h:1722
TStr GetIdColName() const
Gets name of the id column of this table.
Definition: table.h:646
static TBool EvalStrAtom(const TStr &Val1, const TStr &Val2, TPredComp Cmp)
Compare atomic string values Val1 and Val2 using predicate Cmp.
Definition: table.h:123
Definition: gbase.h:23
TRowIteratorWithRemove()
Default constructor.
Definition: table.h:389
static void LoadSSSeq(PTable &NewTable, const Schema &S, const TStr &InFNm, const TIntV &RelevantCols, const char &Separator, TBool HasTitleLine)
Sequentially loads data from input file at InFNm into NewTable.
Definition: table.cpp:649
Definition: table.h:7
Definition: dt.h:1293
Definition: fl.h:58
void Save(TSOut &SOut) const
Definition: ds.h:903
void IncrementNext()
Increments the next vector and sets last, NumRows and NumValidRows.
Definition: table.cpp:2235
PTable SimJoin(const TStrV &Cols1, const TTable &Table, const TStrV &Cols2, const TStr &DistanceColName, const TSimType &SimType, const TFlt &Threshold)
Performs join if the distance between two rows is less than the specified threshold.
Definition: table.cpp:1974
bool Empty() const
Tests whether the vector is empty.
Definition: ds.h:542
void InitIds()
Adds explicit row ids, initializes hash set mapping ids to physical rows.
Definition: table.cpp:1863
TStrTrV CommonNodeAttrs
List of attribute pairs with values common to source and destination and their common given name...
Definition: table.h:604
void QSortPar(TIntV &V, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs QSort in parallel on given vector V.
Definition: table.cpp:3186
void Save(TSOut &SOut)
Saves table schema and content to a binary format.
Definition: table.cpp:834
int GetDstNId() const
Returns the destination of the edge.
Definition: network.h:1726
void Swap(TVec< TVal, TSizeTy > &Vec)
Swaps the contents of the vector with Vec.
Definition: ds.h:1047
TBool Result
Result of evaluating the predicate rooted at this node.
Definition: table.h:54
void ReadFltCol(const TStr &ColName, TFltV &Result) const
Reads values of entire float column into Result.
Definition: table.cpp:5198
void InvalidatePhysicalGroupings()
Definition: table.cpp:1557
TIter EndI() const
Definition: hashmp.h:156
void SkipCommentLines()
Skips lines that begin with a comment character.
Definition: ssmp.cpp:12
TPair< TIntV, TFltV > TGroupKey
Represents grouping key with IntV for integer and string attributes and FltV for float attributes...
Definition: table.h:145
Iterator class for TTable rows.
Definition: table.h:339
TInt GetNextRowIdx() const
Gets physical index of next row.
Definition: table.cpp:243
TVal2 Val2
Definition: ds.h:132
int GetId() const
Returns ID of the current node.
Definition: network.h:1647
static const int Mn
Definition: dt.h:1048
bool Eof() const
Checks for end of file.
Definition: ss.h:122
void Aggregate(const TStrV &GroupByAttrs, TAttrAggr AggOp, const TStr &ValAttr, const TStr &ResAttr, TBool Ordered=true)
Aggregates values of ValAttr after grouping with respect to GroupByAttrs. Results are stored as new at...
Definition: table.cpp:1565
TAttrType GetSchemaColType(TInt Idx) const
Gets type of the column with index Idx in the schema.
Definition: table.h:650
void Clr(const bool &DoDel=true, const TSizeTy &NoDelLim=-1)
Clears the contents of the vector.
Definition: ds.h:971
void SetIntVal(TStr VarName, TInt VarVal)
Set int variable value in the predicate or all the children that use it.
Definition: table.h:98
TStrV GetEdgeIntAttrV() const
Gets edge int attribute name vector.
Definition: table.cpp:1007
Definition: table.h:149
void SetStrVal(TStr VarName, TStr VarVal)
Set str variable value in the predicate or all the children that use it.
Definition: table.h:102
void RemoveNext()
Removes next row.
Definition: table.cpp:278
TStr StrConst
Str const value if this object is a string constant.
Definition: table.h:24
TVec< PNEANet > ToGraphPerGroup(TStr GroupAttr, TAttrAggr AggrPolicy)
Creates a sequence of graphs based on grouping specified by GroupAttr.
Definition: table.cpp:3640
const TTable * Table
Reference to table containing this row.
Definition: table.h:341
static void Throw(const TStr &MsgStr)
Definition: ut.h:187
Schema DenormalizeSchema() const
Removes suffix from column names in the Schema.
Definition: table.cpp:4642
PNEANet NextGraphIterator()
Calls to this must be preceded by a call to one of the above ToGraph*Iterator functions.
Definition: table.cpp:3659
void PutAll(const TVal &Val)
Sets all elements of the vector to value Val.
Definition: ds.h:1166
unsigned long long uint64
Definition: bd.h:38
PNEANet BuildGraph(const TIntV &RowIds, TAttrAggr AggrPolicy)
Makes a single pass over the rows in the given row id set, and creates nodes, edges, assigns node and edge attributes.
Definition: table.cpp:3425
TBool EvalAtomicPredicate(const TAtomicPredicate &Atom)
Evaluate the given atomic predicate.
Definition: table.cpp:102
void ColSub(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise subtraction. See TTable::ColGenericOp.
Definition: table.cpp:4797
int GetSrcNId() const
Gets the source node of an edge.
Definition: graphmp.h:116
const TVal & GetDat(const TVal &Val) const
Returns reference to the first occurrence of element Val.
Definition: ds.h:807
int GetEmptyRowsStart(int NewRows)
Gets the start index to a chunk of empty rows of size NewRows.
Definition: table.cpp:4353
void PrintSize()
Definition: table.cpp:3908
THash< TStr, THash< TFlt, TIntV > > FltColIndexes
Indexes for Float Columns.
Definition: table.h:580
TStr Lvar
Left variable of the comparison op.
Definition: table.h:20
const char * GetKey(const int &KeyId) const
Definition: hash.h:821
void ProjectInPlace(const TStrV &ProjectCols)
Keeps only the columns specified in ProjectCols.
Definition: table.cpp:5216
TStr GetStr() const
Definition: table.h:237
TBool CompareAtomicConst(TInt ColIdx, const TPrimitive &Val, TPredComp Cmp)
Compares value in column ColIdx with given primitive Val.
Definition: table.cpp:190
size_t TSize
Definition: bd.h:58
#define Assert(Cond)
Definition: bd.h:251
void Reindex()
Reinitializes row ids.
Definition: table.cpp:1869
TInt CurrBucket
Current row id bucket - used when generating a sequence of graphs using an iterator.
Definition: table.h:610
PTable IsNextK(const TStr &OrderCol, TInt K, const TStr &GroupBy, const TStr &RankColName="")
Distance based filter.
Definition: table.cpp:3869
TAttrType GetColType(const TStr &ColName) const
Gets type of column ColName.
Definition: table.h:1218
TVec< TIntV > StrColMaps
Data columns of integer mappings of string attributes.
Definition: table.h:570
int sync_bool_compare_and_swap(int *lock)
Definition: table.cpp:4147
TRowIteratorWithRemove & Next()
Increments the iterator (For Python compatibility).
Definition: table.cpp:222
PNEANet ToGraphSequenceIterator(TStr SplitAttr, TAttrAggr AggrPolicy, TInt WindowSize, TInt JumpSize, TInt StartVal=TInt::Mn, TInt EndVal=TInt::Mx)
Creates the graph sequence one at a time.
Definition: table.cpp:3644
Definition: table.h:149
int GetDstNId() const
Gets destination node of an edge.
Definition: graphmp.h:118
int AddKey(const TKey &Key)
Definition: shash.h:1254
::TSize GetMemUsed() const
Definition: hash.h:159
void GroupByIntCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with integer values.
Definition: table.h:1605
PTable Join(const TStr &Col1, const TTable &Table, const TStr &Col2)
Performs equijoin.
Definition: table.cpp:2252
bool IsKey(const TKey &Key) const
Definition: hashmp.h:191
static void LoadSSPar(PTable &NewTable, const Schema &S, const TStr &InFNm, const TIntV &RelevantCols, const char &Separator, TBool HasTitleLine)
Loads data in parallel from input file at InFNm into NewTable. Only works when NewTable has no string c...
Definition: table.cpp:487
TIntV GetIntRowIdxByVal(const TStr &ColName, const TInt &Val) const
Gets the rows containing Val in int column ColName.
Definition: table.cpp:5387
TInt GetRowIdx() const
Gets the id of the row pointed by this iterator.
Definition: table.cpp:151
bool GetFlt(const int &FldN, double &Val) const
If the field FldN is a float its value is returned in Val and the function returns true...
Definition: ss.cpp:485
A class representing a cached grouping statement identifier.
Definition: table.h:275
TStr GetSchemaColName(TInt Idx) const
Gets name of the column with index Idx in the schema.
Definition: table.h:648
int GetSrcNId() const
Returns the source of the edge.
Definition: network.h:1724
TInt GetStrMapById(TInt ColIdx) const
Returns integer mapping of a string attribute value specified by string column index for current row...
Definition: table.cpp:186
TStrV SrcNodeAttrV
List of columns (attributes) to serve as source node attributes.
Definition: table.h:602
TAttrAggr AggrPolicy
Aggregation policy used for solving conflicts between different values of an attribute of the same no...
Definition: table.h:611
static void QSortKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5355
void Select(TPredicate &Predicate, TIntV &SelectedRows, TBool Remove=true)
Selects rows that satisfy given Predicate.
Definition: table.cpp:2730
void UnionAllInPlace(const TTable &Table)
Same as TTable::ConcatTable.
Definition: table.cpp:4501
TInt GetInt() const
Definition: table.h:235
char GetCh(const int &ChN) const
Definition: dt.h:483
TIntIntH RowIdMap
Mapping of permanent row ids to physical id.
Definition: table.h:576
void SaveSS(const TStr &OutFNm)
Saves table schema and content to a TSV file.
Definition: table.cpp:780
PTable Union(const TTable &Table)
Returns union of this table with given Table.
Definition: table.cpp:4508
void SelectAtomicConst(const TStr &Col, const TPrimitive &Val, TPredComp Cmp, TIntV &SelectedRows, PTable &SelectedTable, TBool Remove=true, TBool Table=true)
Selects rows where the value of Col matches given primitive Val.
Definition: table.cpp:2853
Definition: table.h:5
void UpdateFltFromTable(const TStr &KeyAttr, const TStr &UpdateAttr, const TTable &Table, const TStr &FKeyAttr, const TStr &ReadAttr, TFlt DefaultFltVal=0.0)
Definition: table.cpp:4219
Edge iterator. Only forward iteration (operator++) is supported.
Definition: graphmp.h:99
void ColConcatConst(const TStr &Attr1, const TStr &Val, const TStr &Sep="", const TStr &ResAttr="")
Concatenates column values with given string value.
Definition: table.cpp:5159
Definition: fl.h:128
void GetCollidingRows(const TTable &T, THashSet< TInt > &Collisions)
Gets set of row ids of rows common with table T.
Definition: table.cpp:3991
void AddGraphAttribute(const TStr &Attr, TBool IsEdge, TBool IsSrc, TBool IsDst)
Adds names of columns to be used as graph attributes.
Definition: table.cpp:965
void KeepSortedRows(const TIntV &KeepV)
Removes all rows that are not mentioned in the SORTED vector KeepV.
Definition: table.cpp:1132
TPair< TAttrType, TInt > GetColTypeMap(const TStr &ColName) const
Gets column type and index of ColName.
Definition: table.h:676
TAttrType GetType() const
Definition: table.h:238
THash< TInt, TInt > TIntH
Definition: hash.h:565
void GroupingSanityCheck(const TStr &GroupBy, const TAttrType &AttrType) const
Checks if grouping key exists and matches given attr type.
Definition: table.cpp:1195
void GetFltAttrNames(TStrV &Names) const
Gets vector of flt attribute names.
Definition: network.h:1744
TStrHash< TInt, TBigStrPool > StringVals
StringPool - stores string data values and maps them to integers.
Definition: table.h:196
void UpdateTableForNewRow()
Updates table state after adding one or more rows.
Definition: table.cpp:4117
void SetVal(const TSizeTy &ValN, const TVal &Val)
Sets the value of element at position ValN to Val.
Definition: ds.h:625
int AddKey(const char *Key)
Definition: hash.h:896
static TInt UseMP
Global switch for choosing multi-threaded versions of TTable functions.
Definition: table.h:500
virtual void Flush()=0
TPredComp Compare
Comparison op represented by this node.
Definition: table.h:19
void DelColType(const TStr &ColName)
Removes column with name ColName from the ColTypeMap.
Definition: table.h:671
Definition: dt.h:1044
void ReadIntCol(const TStr &ColName, TIntV &Result) const
Reads values of entire int column into Result.
Definition: table.cpp:5189
void FillBucketsByWindow(TStr SplitAttr, TInt JumpSize, TInt WindowSize, TInt StartVal, TInt EndVal)
Fills RowIdBuckets with sets of row ids.
Definition: table.cpp:3527
static TStr NormalizeColName(const TStr &ColName)
Adds suffix to column name if it doesn't exist.
Definition: table.h:540
void AddStrCol(const TStr &ColName)
Adds a string column with name ColName.
Definition: table.cpp:4664
THash< TStr, GroupStmt > GroupStmtNames
Maps user-given grouping statement names to their group-by attributes.
Definition: table.h:583
TRowIterator & Next()
Increments the iterator (For Python compatibility).
Definition: table.cpp:135
TStr SrcCol
Column (attribute) to serve as src nodes when constructing the graph.
Definition: table.h:599
void GetIntAttrNames(TStrV &Names) const
Gets vector of int attribute names.
Definition: network.h:1689
void ISort(const TSizeTy &MnLValN, const TSizeTy &MxRValN, const bool &Asc)
Insertion sorts the values between positions MnLValN...MxRValN.
Definition: ds.h:1184
PTable Project(const TStrV &ProjectCols)
Returns table with only the columns in ProjectCols.
Definition: table.cpp:4592
void StoreStrCol(const TStr &ColName, const TStrV &ColVals)
Adds entire str column to table.
Definition: table.cpp:4098
TPredicateNode * Right
Definition: table.h:58
TVec< TFltV > FltCols
Data columns of floating point attributes.
Definition: table.h:569
TVec< TStr > TStrV
Definition: ds.h:1534
TStrV GetDstNodeFltAttrV() const
Gets dst node float attribute name vector.
Definition: table.cpp:1029
TStrV DstNodeAttrV
List of columns (attributes) to serve as destination node attributes.
Definition: table.h:603
uint64 CountNewLinesInRange(uint64 Lb, uint64 Ub) const
Counts number of occurrences of '\n' in [Lb, Ub)
Definition: ssmp.cpp:102
Edge iterator. Only forward iteration (operator++) is supported.
Definition: network.h:1707
TIntV Next
A vector describing the logical order of the rows.
Definition: table.h:565
static int GetRnd(const int &Range=0)
Definition: dt.h:1085
Definition: ds.h:32
void Gen(const int &ExpectVals)
Definition: hashmp.h:160
int AddKey(const TKey &Key)
Definition: hash.h:331
TRowIterator EndRI() const
Gets iterator to the last valid row of the table.
Definition: table.h:1234
void AddStrVal(const TInt &ColIdx, const TStr &Val)
Adds Val in column with id ColIdx.
Definition: table.cpp:951
TTable * Table
Reference to table containing this row.
Definition: table.h:385
int GetIntFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets integer at field FldN.
Definition: ssmp.cpp:152
void AddRow(const TRowIterator &RI)
Adds row corresponding to RI.
Definition: table.cpp:4272
void NextFromIndex(uint64 Index, TVec< char * > &FieldsV)
Loads next line starting from a given position.
Definition: ssmp.cpp:128
TInt NumRows
Number of rows in the table (valid and invalid).
Definition: table.h:561
TFlt GetFltVal(const TStr &ColName, const TInt &RowIdx)
Gets the value of float attribute ColName at row RowIdx.
Definition: table.h:1015
static PTable LoadSS(const Schema &S, const TStr &InFNm, TTableContext *Context, const char &Separator= '\t', TBool HasTitleLine=false)
Loads table from spread sheet (TSV, CSV, etc). Note: HasTitleLine = true is not supported. Please comment title lines instead.
Definition: table.cpp:775
TVec< TFlt > TFltV
Definition: ds.h:1531
void Unique(const TStr &Col)
Removes rows with duplicate values in given column.
Definition: table.cpp:1246
TRowIteratorWithRemove & operator++(int)
Increments the iterator.
Definition: table.cpp:218
void AddJointRow(const TTable &T1, const TTable &T2, TInt RowIdx1, TInt RowIdx2)
Adds joint row T1[RowIdx1]<=>T2[RowIdx2].
Definition: table.cpp:1937
void Classify(TPredicate &Predicate, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Definition: table.cpp:2785
void Merge(TIntV &V, TInt Idx1, TInt Idx2, TInt Idx3, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Helper function for parallel QSort.
Definition: table.cpp:3158
TStr DstCol
Column (attribute) to serve as dst nodes when constructing the graph.
Definition: table.h:600
TIter BegI() const
Definition: hashmp.h:153
void ReadStrCol(const TStr &ColName, TStrV &Result) const
Reads values of entire string column into Result.
Definition: table.cpp:5207
TStr GetStrVal(TInt ColIdx, TInt RowIdx) const
Gets the value in column with id ColIdx at row RowIdx.
Definition: table.h:636
long long int64
Definition: bd.h:27
void GetKeyV(TVec< TKey > &KeyV) const
Definition: hash.h:442
static PTable GetEdgeTable(const PNEANet &Network, TTableContext *Context)
Extracts edge TTable from PNEANet.
Definition: table.cpp:3719
static const TInt Invalid
Special value for Next vector entry - logically removed row.
Definition: table.h:498
void AddColType(const TStr &ColName, TPair< TAttrType, TInt > ColType)
Adds column with name ColName and type ColType to the ColTypeMap.
Definition: table.h:661
Definition: dt.h:412
PNEANet GetNextGraphFromSequence()
Returns the next graph in sequence corresponding to RowIdBuckets.
Definition: table.cpp:3612
bool Empty() const
Definition: dt.h:488
TBool CompareAtomicConst(TInt ColIdx, const TPrimitive &Val, TPredComp Cmp)
Compares value in column ColIdx with given primitive Val.
Definition: table.cpp:282
void StoreFltCol(const TStr &ColName, const TFltV &ColVals)
Adds entire flt column to table.
Definition: table.cpp:4081
THash< GroupStmt, THash< TInt, TGroupKey > > GroupIDMapping
Maps grouping statements to their (group id -> group-by key) mapping.
Definition: table.h:587
TInt IntConst
Int const value if this object is an integer constant.
Definition: table.h:22
TIter BegI() const
Returns an iterator pointing to the first element in the vector.
Definition: ds.h:565
TPredOp Op
Logical op represented by this node.
Definition: table.h:53
void GroupByStrCol(const TStr &GroupBy, T &Grouping, const TIntV &IndexSet, TBool All, TBool UsePhysicalIds=true) const
Groups/hashes by a single column with string values. Returns hash table with grouping.
Definition: table.h:1660
TTableContext * ChangeContext(TTableContext *Context)
Changes the current context. Moves all object items to the new context.
Definition: table.cpp:901
Definition: hash.h:88
TInt CurrRowIdx
Physical row index of current row pointed to by iterator.
Definition: table.h:384
TPredicateNode * Root
Root node of the current predicate tree.
Definition: table.h:87
Definition: gbase.h:23
Definition: table.h:268
void AggregateCols(const TStrV &AggrAttrs, TAttrAggr AggOp, const TStr &ResAttr)
Aggregates attributes in AggrAttrs across columns.
Definition: table.cpp:1730
bool operator==(const TRowIteratorWithRemove &RowI) const
Checks if this iterator points to the same row pointed by RowI.
Definition: table.cpp:235
Table class: Relational table with columnar data storage.
Definition: table.h:495
bool operator<(const TRowIterator &RowI) const
Checks if this iterator points to a row that is before the one pointed by RowI.
Definition: table.cpp:141
void SetStreamPos(uint64 Pos)
Sets position of stream pointer.
Definition: ssmp.h:97
void UpdateFltFromTableMP(const TStr &KeyAttr, const TStr &UpdateAttr, const TTable &Table, const TStr &FKeyAttr, const TStr &ReadAttr, TFlt DefaultFltVal=0.0)
Definition: table.cpp:4151
static PTable GetEdgeTablePN(const PNGraphMP &Network, TTableContext *Context)
Extracts edge TTable from parallel graph PNGraphMP.
Definition: table.cpp:3777
void ISort(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc=true)
Performs insertion sort on given vector V.
Definition: table.cpp:3076
TInt GetRowIdx() const
Gets physical index of current row.
Definition: table.cpp:239
TInt RequestIndexFlt(const TStr &ColName)
Creates Index for Flt Column ColName.
Definition: table.cpp:5472
static TBool EvalAtom(T Val1, T Val2, TPredComp Cmp)
Compare atomic values Val1 and Val2 using predicate Cmp.
Definition: table.h:110
bool operator<(const TRowIteratorWithRemove &RowI) const
Checks if this iterator points to a row that is before the one pointed by RowI.
Definition: table.cpp:229
void InitRowIdBuckets(int NumBuckets)
Initializes the RowIdBuckets vector which will be used for the graph sequence creation.
Definition: table.cpp:3515
TStrV GetSrcNodeFltAttrV() const
Gets src node float attribute name vector.
Definition: table.cpp:1018
static PTable GetFltNodePropertyTable(const PNEANet &Network, const TIntFltH &Property, const TStr &NodeAttrName, const TAttrType &NodeAttrType, const TStr &PropertyAttrName, TTableContext *Context)
Extracts node and edge property TTables from THash.
Definition: table.cpp:3830
Hash-Table with multiprocessing support.
Definition: hashmp.h:81
PTable ThresholdJoinPerJoinKeyOutputTable(const THash< TIntTr, TIntTr > &Counters, TInt Threshold, const TTable &Table)
Definition: table.cpp:2602
TVal1 Val1
Definition: ds.h:34
PTable ThresholdJoin(const TStr &KeyCol1, const TStr &JoinCol1, const TTable &Table, const TStr &KeyCol2, const TStr &JoinCol2, TInt Threshold, TBool PerJoinKey=false)
Definition: table.cpp:2624
static void ISortKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5298
TBool IsConst
Flag if this atomic node represents a constant value.
Definition: table.h:18
TInt CurrRowIdx
Physical row index of current row pointed by iterator.
Definition: table.h:340
TVal2 Val2
Definition: ds.h:35
TVec< TInt > TIntV
Definition: ds.h:1529
static TInt GetPivotKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5315
void Clr(const bool &DoDel=true, const int &NoDelLim=-1, const bool &ResetDat=true)
Definition: hash.h:319
Definition: table.h:7
bool Next()
Loads next line from the input file.
Definition: ss.cpp:412
Definition: bd.h:196
TInt IsNextDirty
Flag to signify whether the rows are stored in logical sequence or reordered. Used for optimizing Get...
Definition: table.h:613
TStrV GetEdgeStrAttrV() const
Gets edge str attribute name vector.
Definition: table.cpp:1074
Definition: table.h:5
void AddFltCol(const TStr &ColName)
Adds a float column with name ColName.
Definition: table.cpp:4657
TInt CompareRows(TInt R1, TInt R2, const TAttrType &CompareByType, const TInt &CompareByIndex, TBool Asc=true)
Returns positive value if R1 is bigger, negative value if R2 is bigger, and 0 if they are equal (strc...
Definition: table.cpp:3044
TStr RenumberColName(const TStr &ColName) const
Returns a re-numbered column name based on number of existing columns with conflicting names...
Definition: table.cpp:4609
TTriple< TInt, TInt, TInt > TIntTr
Definition: ds.h:170
TInt NumValidRows
Number of valid rows in the table (i.e. rows that were not logically removed).
Definition: table.h:562
TTable()
Definition: table.cpp:302
void Gen(const TSizeTy &_Vals)
Constructs a vector (an array) of _Vals elements.
Definition: ds.h:495
PTable ThresholdJoinOutputTable(const THash< TIntPr, TIntTr > &Counters, TInt Threshold, const TTable &Table)
Definition: table.cpp:2588
void Count(const TStr &CountColName, const TStr &Col)
Counts number of unique elements.
Definition: table.cpp:1782
PTable InitializeJointTable(const TTable &Table)
Initializes an empty table for the join of this table with the given table.
Definition: table.cpp:1896
void ColMax(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs max of two columns. See TTable::ColGenericOp.
Definition: table.cpp:4817
void Reserve(const TSizeTy &_MxVals)
Reserves enough memory for the vector to store _MxVals elements.
Definition: ds.h:515
void ClassifyAtomic(const TStr &Col1, const TStr &Col2, TPredComp Cmp, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Definition: table.cpp:2846
bool Cmp(const int &RelOp, const TRec &Rec1, const TRec &Rec2)
Definition: bd.h:426
void StoreIntCol(const TStr &ColName, const TIntV &ColVals)
Adds entire int column to table.
Definition: table.cpp:4064
void AddIdColumn(const TStr &IdColName)
Adds a column of explicit integer identifiers to the rows.
Definition: table.cpp:1880
void GetVariables(TStrV &Variables)
Get variables in the predicate tree rooted at this node.
Definition: table.cpp:1
static TInt CheckSortedKeyVal(TIntV &Key, TIntV &Val, TInt Start, TInt End)
Definition: table.cpp:5287
void AddEdgeAttributes(PNEANet &Graph, int RowId)
Adds attributes of edge corresponding to RowId to the Graph.
Definition: table.cpp:3375
void GetIntAttrNames(TStrV &Names) const
Gets vector of int attribute names.
Definition: network.h:1732
Definition: table.h:5
Definition: gbase.h:23
TVec< PNEANet > ToVarGraphSequence(TStr SplitAttr, TAttrAggr AggrPolicy, TIntPrV SplitIntervals)
Creates a sequence of graphs based on values of column SplitAttr and intervals specified by SplitInte...
Definition: table.cpp:3635
char * CStr()
Definition: dt.h:476
TInt GetNextIntAttr(TInt ColIdx) const
Returns value of integer attribute specified by integer column index for next row.
Definition: table.cpp:248
void ColGenericOp(const TStr &Attr1, const TStr &Attr2, const TStr &ResAttr, TArithOp op)
Performs columnwise arithmetic operation.
Definition: table.cpp:4729
void SelectAtomic(const TStr &Col1, const TStr &Col2, TPredComp Cmp, TIntV &SelectedRows, TBool Remove=true)
Selects rows using atomic compare operation.
Definition: table.cpp:2793
TRowIterator & operator++(int)
Increments the iterator.
Definition: table.cpp:131
bool IsKey(const TKey &Key) const
Definition: hash.h:216
void GetVariables(TStrV &Variables)
Get variables in current predicate.
Definition: table.cpp:10
bool IsInt(const int &FldN) const
Checks whether field FldN is an integer.
Definition: ss.h:143
TSizeTy Add()
Adds a new element at the end of the vector, after its current last element.
Definition: ds.h:574
void ColMin(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs min of two columns. See TTable::ColGenericOp.
Definition: table.cpp:4813
Definition: dt.h:881
void ColMod(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise modulus. See TTable::ColGenericOp.
Definition: table.cpp:4809
static PNEANet New()
Static constructor returns pointer to graph. Ex: PNEANet Graph=TNEANet::New().
Definition: network.h:1940
void GetFltAttrNames(TStrV &Names) const
Gets vector of flt attribute names.
Definition: network.h:1701
void RemoveFirstRow()
Removes first valid row of the table.
Definition: table.cpp:1102
bool IsStrIn(const TStr &Str) const
Definition: dt.h:554
TBool IsFirst() const
Checks whether iterator points to first valid row of the table.
Definition: table.cpp:274
void Trunc(const TSizeTy &_Vals=-1)
Truncates the vector's length and capacity to _Vals elements.
Definition: ds.h:982
Atomic predicate - encapsulates comparison operations.
Definition: table.h:15
TBool IsColName(const TStr &ColName) const
Definition: table.h:656
Definition: table.h:268
TInt CheckAndAddFltNode(T Graph, THash< TFlt, TInt > &NodeVals, TFlt FNodeVal)
Checks if given NodeVal is seen earlier; if not, add it to Graph and hashmap NodeVals.
Definition: table.h:1540
TFlt GetFlt() const
Definition: table.h:236
Predicate node - represents a binary predicate operation on two predicate nodes.
Definition: table.h:51
int Len() const
Definition: hash.h:186
static PTable New()
Definition: table.h:931
void AddNodeAttributes(TInt NId, TStrV NodeAttrV, TInt RowId, THash< TInt, TStrIntVH > &NodeIntAttrs, THash< TInt, TStrFltVH > &NodeFltAttrs, THash< TInt, TStrStrVH > &NodeStrAttrs)
Takes as parameters, and updates, maps NodeXAttrs: Node Id --> (attribute name --> Vector of attribut...
Definition: table.cpp:3394
void GetStrAttrNames(TStrV &Names) const
Gets vector of str attribute names.
Definition: network.h:1697
PNEANet GetFirstGraphFromSequence(TAttrAggr AggrPolicy)
Returns the first graph of the sequence.
Definition: table.cpp:3606
TDat & AddDat(const TKey &Key)
Definition: hash.h:196
PTable Intersection(const TTable &Table)
Returns intersection of this table with given Table.
Definition: table.cpp:4544
void AddNJointRowsMP(const TTable &T1, const TTable &T2, const TVec< TIntPrV > &JointRowIDSet)
Adds rows from T1 and T2 to this table in a parallel manner. Used by Join.
Definition: table.cpp:4419
const TDat & GetDat(const TKey &Key) const
Definition: hashmp.h:195
TFlt FltConst
Flt const value if this object is a float constant.
Definition: table.h:23
TBool Eval()
Return the result of evaluating current predicate.
Definition: table.cpp:14
TIntV GetFltRowIdxByVal(const TStr &ColName, const TFlt &Val) const
Gets the rows containing Val in flt column ColName.
Definition: table.cpp:5430
Definition: table.h:268
TSize GetContextMemUsedKB()
Returns approximate memory used by table context in [KB].
Definition: table.cpp:3946
uint64 GetStreamLen() const
Returns length of stream.
Definition: ssmp.h:93
TPredicateNode * Parent
Parent node of this node.
Definition: table.h:56
const TKey & GetKey(const int &KeyId) const
Definition: hash.h:210
TInt GetIntVal(const TStr &ColName, const TInt &RowIdx)
Gets the value of integer attribute ColName at row RowIdx.
Definition: table.h:1011
void AddTable(const TTable &T)
Adds all the rows of the input table. Allows duplicate rows (not a union).
Definition: table.cpp:3952
bool IsCmt() const
Checks whether the current line is a comment (starts with '#').
Definition: ss.h:120
void ColMul(const TStr &Attr1, const TStr &Attr2, const TStr &ResultAttrName="")
Performs columnwise multiplication. See TTable::ColGenericOp.
Definition: table.cpp:4801
TVal3 Val3
Definition: ds.h:133
void ClassifyAux(const TIntV &SelectedRows, const TStr &LabelName, const TInt &PositiveLabel=1, const TInt &NegativeLabel=0)
Adds a label attribute with positive labels on selected rows and negative labels on the rest...
Definition: table.cpp:4671
THash< TStr, TFlt > FltVars
Float variables in the current predicate tree.
Definition: table.h:85
void AddNRows(int NewRows, const TVec< TIntV > &IntColsP, const TVec< TFltV > &FltColsP, const TVec< TIntV > &StrColMapsP)
Adds NewRows rows from the given vectors for each column type.
Definition: table.cpp:4398
TVec< PTable > SpliceByGroup(const TStrV &GroupByAttrs, TBool Ordered=true)
Splices table into subtables according to a grouping statement.
Definition: table.cpp:1788
Definition: table.h:266
int GetKeyId(const char *Key) const
Definition: hash.h:922
Definition: table.h:5
void ColGenericOpMP(TInt ArgColIdx1, TInt ArgColIdx2, TAttrType ArgType1, TAttrType ArgType2, TInt ResColIdx, TArithOp op)
Definition: table.cpp:4685
TVec< PNEANet > GetGraphsFromSequence(TAttrAggr AggrPolicy)
Returns a sequence of graphs.
Definition: table.cpp:3594
TStrV GetDstNodeIntAttrV() const
Gets dst node int attribute name vector.
Definition: table.cpp:996
TAtomicPredicate Atom
Atomic predicate at this node.
Definition: table.h:55
bool IsFlt(const int &FldN) const
Checks whether field FldN is a float.
Definition: ss.h:148
TSizeTy AddV(const TVec< TVal, TSizeTy > &ValV)
Adds the elements of the vector ValV to the end of the vector.
Definition: ds.h:1056
TInt Partition(TIntV &V, TInt StartIdx, TInt EndIdx, const TVec< TAttrType > &SortByTypes, const TIntV &SortByIndices, TBool Asc)
Partitions vector for QSort.
Definition: table.cpp:3106
double GetFltFromFldV(TVec< char * > &FieldsV, const int &FldN)
Gets float at field FldN.
Definition: ssmp.cpp:170