<?xml version="1.0" encoding="utf-8"?><!DOCTYPE article  PUBLIC '-//OASIS//DTD DocBook XML V4.4//EN'  'http://www.docbook.org/xml/4.4/docbookx.dtd'><article><articleinfo><title>Spinn3rHadoopDataSet</title><revhistory><revision><revnumber>19</revnumber><date>2014-09-06 01:18:21</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>18</revnumber><date>2014-09-06 01:11:19</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>17</revnumber><date>2014-09-05 23:12:44</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>16</revnumber><date>2014-09-05 23:12:26</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>15</revnumber><date>2014-09-05 22:37:17</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>14</revnumber><date>2014-09-05 22:31:10</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>13</revnumber><date>2014-09-05 22:31:01</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>12</revnumber><date>2014-09-05 22:30:43</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>11</revnumber><date>2014-09-05 22:29:31</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>10</revnumber><date>2014-09-05 22:28:29</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>9</revnumber><date>2014-09-05 22:23:18</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>8</revnumber><date>2014-09-05 22:23:09</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>7</revnumber><date>2014-09-05 22:22:48</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>6</revnumber><date>2014-09-05 22:22:32</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>5</revnumber><date>2014-09-05 
22:19:29</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>4</revnumber><date>2014-09-05 22:19:07</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>3</revnumber><date>2014-09-05 22:18:20</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>2</revnumber><date>2014-09-05 22:17:59</date><authorinitials>NikoColneric</authorinitials></revision><revision><revnumber>1</revnumber><date>2014-09-05 22:16:43</date><authorinitials>NikoColneric</authorinitials></revision></revhistory></articleinfo><section><title>Spinn3r data set on Hadoop cluster</title><para>This page provides all information about the Spinn3r data set stored on the Hadoop cluster. </para><section><title>Data records versions</title><para>There are several versions in which the records are stored. It is important to know which version you are processing, since the version determines which fields are available for a record and how the text was preprocessed. For example, in some versions there are no capital letters, no raw HTML fields, etc. 
</para><informaltable><tgroup cols="3"><colspec colname="col_0"/><colspec colname="col_1"/><colspec colname="col_2"/><tbody><row rowsep="1"><entry colsep="1" rowsep="1"><para>      <emphasis role="strong">From</emphasis>               </para></entry><entry colsep="1" rowsep="1"><para> <emphasis role="strong">To</emphasis>                     </para></entry><entry colsep="1" rowsep="1"><para> <emphasis role="strong">Version</emphasis></para></entry></row><row rowsep="1"><entry colsep="1" rowsep="1"><para>       2008-08-01       </para></entry><entry colsep="1" rowsep="1"><para>     2010-07-13     </para></entry><entry colsep="1" rowsep="1"><para>         A      </para></entry></row><row rowsep="1"><entry colsep="1" rowsep="1"><para>       2010-07-14       </para></entry><entry colsep="1" rowsep="1"><para>     2010-07-26     </para></entry><entry colsep="1" rowsep="1"><para>         B      </para></entry></row><row rowsep="1"><entry colsep="1" rowsep="1"><para>       2010-07-27       </para></entry><entry colsep="1" rowsep="1"><para>     2013-04-30     </para></entry><entry colsep="1" rowsep="1"><para>         C      </para></entry></row><row rowsep="1"><entry colsep="1" rowsep="1"><para>       2013-05-01       </para></entry><entry colsep="1" rowsep="1"><para>     2014-05-30     </para></entry><entry colsep="1" rowsep="1"><para>         D      </para></entry></row><row rowsep="1"><entry colsep="1" rowsep="1"><para>       2014-06-01       </para></entry><entry colsep="1" rowsep="1"><para>     ...     </para></entry><entry colsep="1" rowsep="1"><para>         E      </para></entry></row></tbody></tgroup></informaltable></section><section><title>More information</title><para><ulink url="http://snap.stanford.edu/moin/Spinn3rHadoopDataSet/moin/Spinn3rFormat#">Spinn3rFormat</ulink> - provides a detailed description of version transitions and parsing. </para></section></section></article>