pktools  2.6.7 Processing Kernel for geospatial data
pkstatascii.cc
1 /**********************************************************************
2 pkstatascii.cc: program to calculate basic statistics from text file
3 Copyright (C) 2008-2014 Pieter Kempeneers
4
5 This file is part of pktools
6
7 pktools is free software: you can redistribute it and/or modify
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 pktools is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with pktools. If not, see <http://www.gnu.org/licenses/>.
19 ***********************************************************************/
20 #include <iostream>
21 #include <fstream>
22 #include <vector>
23 #include <math.h>
24 #include "base/Optionpk.h"
26 #include "algorithms/StatFactory.h"
27 /******************************************************************************/
90 using namespace std;
91
92 int main(int argc, char *argv[])
93 {
94  Optionpk<string> input_opt("i","input","name of the input text file");
95  Optionpk<char> fs_opt("fs","fs","field separator.",' ');
96  Optionpk<char> comment_opt("comment","comment","comment character",'#');
97  Optionpk<bool> output_opt("o","output","output the selected columns",false);
98  Optionpk<bool> transpose_opt("t","transpose","transpose input ascii vector (use in combination with --output)",false);
99  Optionpk<int> col_opt("c", "column", "column nr, starting from 0", 0);
100  Optionpk<int> range_opt("r", "range", "rows to start/end reading. Use -r 1 -r 10 to read first 10 rows where first row is header. Use 0 to read all rows with no header.", 0);
101  Optionpk<bool> size_opt("size","size","sample size",false);
102  Optionpk<unsigned int> rand_opt("rnd", "rnd", "generate random numbers", 0);
103  Optionpk<std::string> randdist_opt("dist", "dist", "distribution for generating random numbers, see http://www.gn/software/gsl/manual/gsl-ref_toc.html#TOC320 (only uniform and Gaussian supported yet)", "gaussian");
104  Optionpk<double> randa_opt("rnda", "rnda", "first parameter for random distribution (mean value in case of Gaussian)", 0);
105  Optionpk<double> randb_opt("rndb", "rndb", "second parameter for random distribution (standard deviation in case of Gaussian)", 1);
106  Optionpk<bool> mean_opt("mean","mean","calculate median",false);
107  Optionpk<bool> median_opt("median","median","calculate median",false);
108  Optionpk<bool> var_opt("var","var","calculate variance",false);
109  Optionpk<bool> skewness_opt("skew","skewness","calculate skewness",false);
110  Optionpk<bool> kurtosis_opt("kurt","kurtosis","calculate kurtosis",false);
111  Optionpk<bool> stdev_opt("stdev","stdev","calculate standard deviation",false);
112  Optionpk<bool> sum_opt("sum","sum","calculate sum of column",false);
113  Optionpk<bool> minmax_opt("mm","minmax","calculate minimum and maximum value",false);
114  Optionpk<bool> min_opt("min","min","calculate minimum value",false);
115  Optionpk<bool> max_opt("max","max","calculate maximum value",false);
116  Optionpk<double> src_min_opt("src_min","src_min","start reading source from this minimum value");
117  Optionpk<double> src_max_opt("src_max","src_max","stop reading source from this maximum value");
118  Optionpk<bool> histogram_opt("hist","hist","calculate histogram",false);
119  Optionpk<bool> histogram2d_opt("hist2d","hist2d","calculate 2-dimensional histogram based on two columns",false);
120  Optionpk<short> nbin_opt("nbin","nbin","number of bins to calculate histogram");
121  Optionpk<bool> relative_opt("rel","relative","use percentiles for histogram to calculate histogram",false);
122  Optionpk<bool> kde_opt("kde","kde","Use Kernel density estimation when producing histogram. The standard deviation is estimated based on Silverman's rule of thumb",false);
123  Optionpk<bool> correlation_opt("cor","correlation","calculate Pearson produc-moment correlation coefficient between two columns (defined by -c <col1> -c <col2>",false);
124  Optionpk<bool> rmse_opt("rmse","rmse","calculate root mean square error between two columns (defined by -c <col1> -c <col2>",false);
125  Optionpk<bool> reg_opt("reg","regression","calculate linear regression between two columns and get correlation coefficient (defined by -c <col1> -c <col2>",false);
126  Optionpk<bool> regerr_opt("regerr","regerr","calculate linear regression between two columns and get root mean square error (defined by -c <col1> -c <col2>",false);
127  Optionpk<short> verbose_opt("v", "verbose", "verbose mode when positive", 0,2);
128
129  src_min_opt.setHide(1);
130  src_max_opt.setHide(1);
131  fs_opt.setHide(1);
132  range_opt.setHide(1);
133  output_opt.setHide(1);
134  transpose_opt.setHide(1);
135  comment_opt.setHide(1);
136
137  bool doProcess;//stop process when program was invoked with help option (-h --help)
138  try{
139  //mandatory options
140  doProcess=input_opt.retrieveOption(argc,argv);
141  col_opt.retrieveOption(argc,argv);
142  //optional options
143  size_opt.retrieveOption(argc,argv);
144  rand_opt.retrieveOption(argc,argv);
145  randdist_opt.retrieveOption(argc,argv);
146  randa_opt.retrieveOption(argc,argv);
147  randb_opt.retrieveOption(argc,argv);
148  mean_opt.retrieveOption(argc,argv);
149  median_opt.retrieveOption(argc,argv);
150  var_opt.retrieveOption(argc,argv);
151  stdev_opt.retrieveOption(argc,argv);
152  skewness_opt.retrieveOption(argc,argv);
153  kurtosis_opt.retrieveOption(argc,argv);
154  sum_opt.retrieveOption(argc,argv);
155  minmax_opt.retrieveOption(argc,argv);
156  min_opt.retrieveOption(argc,argv);
157  max_opt.retrieveOption(argc,argv);
158  histogram_opt.retrieveOption(argc,argv);
159  nbin_opt.retrieveOption(argc,argv);
160  relative_opt.retrieveOption(argc,argv);
161  kde_opt.retrieveOption(argc,argv);
162  histogram2d_opt.retrieveOption(argc,argv);
163  correlation_opt.retrieveOption(argc,argv);
164  rmse_opt.retrieveOption(argc,argv);
165  reg_opt.retrieveOption(argc,argv);
166  regerr_opt.retrieveOption(argc,argv);
168  src_min_opt.retrieveOption(argc,argv);
169  src_max_opt.retrieveOption(argc,argv);
170  fs_opt.retrieveOption(argc,argv);
171  range_opt.retrieveOption(argc,argv);
172  output_opt.retrieveOption(argc,argv);
173  transpose_opt.retrieveOption(argc,argv);
174  comment_opt.retrieveOption(argc,argv);
175  verbose_opt.retrieveOption(argc,argv);
176  }
177  catch(string predefinedString){
178  std::cout << predefinedString << std::endl;
179  exit(0);
180  }
181  if(!doProcess){
182  cout << endl;
183  cout << "Usage: pkstatascii -i input [-c column]*" << endl;
184  cout << endl;
185  std::cout << "short option -h shows basic options only, use long option --help to show all options" << std::endl;
186  exit(0);//help was invoked, stop processing
187  }
188
189  if(src_min_opt.size()){
190  while(src_min_opt.size()<col_opt.size())
191  src_min_opt.push_back(src_min_opt[0]);
192  }
193  if(src_max_opt.size()){
194  while(src_max_opt.size()<col_opt.size())
195  src_max_opt.push_back(src_max_opt[0]);
196  }
198  if(rand_opt[0]>0){
199  gsl_rng* r=stat.getRandomGenerator(time(NULL));
200  //todo: init random number generator using time...
201  if(verbose_opt[0])
202  std::cout << "generating " << rand_opt[0] << " random numbers: " << std::endl;
203  for(unsigned int i=0;i<rand_opt[0];++i)
204  std::cout << i << " " << stat.getRandomValue(r,randdist_opt[0],randa_opt[0],randb_opt[0]) << std::endl;
205  }
206  vector< vector<double> > dataVector(col_opt.size());
207  vector< vector<double> > statVector(col_opt.size());
208
209  if(!input_opt.size())
210  exit(0);
215  if(range_opt.size()>1)
218  assert(dataVector.size());
219  double minValue=0;
220  double maxValue=0;
221  unsigned int nbin=0;
222  if(nbin_opt.size())
223  nbin=nbin_opt[0];
224  if(histogram_opt[0]){
225  stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minValue,maxValue);
226  if(src_min_opt.size())
227  minValue=src_min_opt[0];
228  if(src_max_opt.size())
229  maxValue=src_max_opt[0];
230  if(nbin<1){
231  std::cerr << "Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
232  nbin=maxValue-minValue+1;
233  }
234  }
235  double minX=0;
236  double minY=0;
237  double maxX=0;
238  double maxY=0;
239  if(histogram2d_opt[0]){
240  assert(col_opt.size()==2);
241  if(nbin<1){
242  std::cerr << "Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
243  stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minX,maxX);
244  stat.minmax(dataVector[1],dataVector[1].begin(),dataVector[1].end(),minY,maxY);
245  if(src_min_opt.size())
246  minX=src_min_opt[0];
247  if(src_min_opt.size()>1)
248  minY=src_min_opt[1];
249  if(src_max_opt.size())
250  maxX=src_max_opt[0];
251  if(src_max_opt.size()>1)
252  maxY=src_max_opt[1];
253  minValue=(minX<minY)? minX:minY;
254  maxValue=(maxX>maxY)? maxX:maxY;
255  if(verbose_opt[0])
256  std::cout << "min and max values: " << minValue << ", " << maxValue << std::endl;
257  nbin=maxValue-minValue+1;
258  }
259  }
260  for(int icol=0;icol<col_opt.size();++icol){
261  if(!dataVector[icol].size()){
262  std::cerr << "Warning: dataVector[" << icol << "] is empty" << std::endl;
263  continue;
264  }
265  if(size_opt[0])
266  cout << "sample size column " << col_opt[icol] << ": " << dataVector[icol].size() << endl;
267  if(mean_opt[0])
268  cout << "mean value column " << col_opt[icol] << ": " << stat.mean(dataVector[icol]) << endl;
269  if(var_opt[0])
270  cout << "variance value column " << col_opt[icol] << ": " << stat.var(dataVector[icol]) << endl;
271  if(stdev_opt[0])
272  cout << "standard deviation column " << col_opt[icol] << ": " << sqrt(stat.var(dataVector[icol])) << endl;
273  if(skewness_opt[0])
274  cout << "skewness value column " << col_opt[icol] << ": " << stat.skewness(dataVector[icol]) << endl;
275  if(kurtosis_opt[0])
276  cout << "kurtosis value column " << col_opt[icol] << ": " << stat.kurtosis(dataVector[icol]) << endl;
277  if(sum_opt[0]){
278  cout << setprecision(2);
279  cout << fixed << "sum column " << col_opt[icol] << ": " << (stat.sum(dataVector[icol])) << endl;
280  }
281  if(median_opt[0])
282  cout << "median value column " << col_opt[icol] << ": " << stat.median(dataVector[icol]) << endl;
283  if(minmax_opt[0]){
284  cout << "min value column " << col_opt[icol] << ": " << stat.mymin(dataVector[icol]) << endl;
285  cout << "max value column " << col_opt[icol] << ": " << stat.mymax(dataVector[icol]) << endl;
286  }
287  if(min_opt[0])
288  cout << "min value column " << col_opt[icol] << ": " << stat.mymin(dataVector[icol]) << endl;
289  if(max_opt[0])
290  cout << "max value column " << col_opt[icol] << ": " << stat.mymax(dataVector[icol]) << endl;
291  if(histogram_opt[0]){
292  //todo: support kernel density function and estimate sigma as in practical estimate of the bandwith in http://en.wikipedia.org/wiki/Kernel_density_estimation
293  double sigma=0;
294  if(kde_opt[0]){//.size()){
295  // if(kde_opt[0]>0)
296  // sigma=kde_opt[0];
297  // else
298  sigma=1.06*sqrt(stat.var(dataVector[icol]))*pow(dataVector[icol].size(),-0.2);
299  }
300  assert(nbin);
301  if(verbose_opt[0]){
302  if(sigma>0)
303  std::cout << "calculating kernel density estimate with sigma " << sigma << " for col " << icol << std::endl;
304  else
305  std::cout << "calculating histogram for col " << icol << std::endl;
306  }
307  //test
308  // cout << "debug0" << endl;
309  // cout << "dataVector.size(): " << dataVector.size() << endl;
310  // cout << "statVector.size(): " << statVector.size() << endl;
311
312  // double theMinValue=0;
313  // double theMaxValue=0;
314
315  // stat.minmax(dataVector[icol],dataVector[icol].begin(),dataVector[icol].end(),theMinValue,theMaxValue);
316  // if(minValue<maxValue&&minValue>theMinValue)
317  // theMinValue=minValue;
318  // if(minValue<maxValue&&maxValue<theMaxValue)
319  // theMaxValue=maxValue;
320
321  // //todo: check...
322  // minValue=theMinValue;
323  // maxValue=theMaxValue;
324
325  // if(maxValue<=minValue){
326  // std::ostringstream s;
327  // s<<"Error: could not calculate distribution (min>=max)";
328  // throw(s.str());
329  // }
330  // assert(nbin);
331  // assert(dataVector[icol].size());
332  // if(statVector[icol].size()!=nbin){
333  // statVector[icol].resize(nbin);
334  // for(int i=0;i<nbin;statVector[icol][i++]=0);
335  // }
336  // typename std::vector<double>::const_iterator it;
337  // for(it=dataVector[icol].begin();it!=dataVector[icol].end();++it){
338  // if(*it<minValue)
339  // continue;
340  // if(*it>maxValue)
341  // continue;
342  // if(stat.isNoData(*it))
343  // continue;
344  // int theBin=0;
345  // if(*it==maxValue)
346  // theBin=nbin-1;
347  // else if(*it>minValue && *it<maxValue)
348  // theBin=static_cast<int>(static_cast<double>((nbin-1)*(*it)-minValue)/(maxValue-minValue));
349  // assert(theBin<statVector[icol].size());
350  // ++statVector[icol][theBin];
351  // // if(*it==maxValue)
352  // // ++statVector[icol][nbin-1];
353  // // else if(*it>=minValue && *it<maxValue)
354  // // ++statVector[icol][static_cast<int>(static_cast<double>((*it)-minValue)/(maxValue-minValue)*nbin)];
355  // }
356
357  // exit(0);
358  //end test
359
360  stat.distribution(dataVector[icol],dataVector[icol].begin(),dataVector[icol].end(),statVector[icol],nbin,minValue,maxValue,sigma);
361  if(verbose_opt[0])
362  std::cout << "min and max values: " << minValue << ", " << maxValue << std::endl;
363  }
364  }
365  if(correlation_opt[0]){
366  assert(dataVector.size()==2);
367  cout << "correlation between columns " << col_opt[0] << " and " << col_opt[1] << ": " << stat.correlation(dataVector[0],dataVector[1]) << endl;
368  }
369  if(rmse_opt[0]){
370  assert(dataVector.size()==2);
371  cout << "root mean square error between columns " << col_opt[0] << " and " << col_opt[1] << ": " << stat.rmse(dataVector[0],dataVector[1]) << endl;
372  }
373  if(reg_opt[0]){
374  assert(dataVector.size()==2);
375  double c0=0;
376  double c1=0;
377  double r2=stat.linear_regression(dataVector[0],dataVector[1],c0,c1);
378  cout << "linear regression between columns: " << col_opt[0] << " and " << col_opt[1] << ": " << c0 << "+" << c1 << " * x " << " with R^2 (square correlation coefficient): " << r2 << endl;
379  }
380  if(regerr_opt[0]){
381  assert(dataVector.size()==2);
382  double c0=0;
383  double c1=0;
384  double err=stat.linear_regression_err(dataVector[0],dataVector[1],c0,c1);
385  if(verbose_opt[0])
386  cout << "linear regression between columns: " << col_opt[0] << " and " << col_opt[1] << ": " << c0 << "+" << c1 << " * x " << " with rmse: " << err << endl;
387  else
388  cout << c0 << " " << c1 << " " << err << endl;
389  }
390  if(histogram_opt[0]){
391  for(int irow=0;irow<statVector.begin()->size();++irow){
392  double binValue=0;
393  if(nbin==maxValue-minValue+1)
394  binValue=minValue+irow;
395  else
396  binValue=minValue+static_cast<double>(maxValue-minValue)*(irow+0.5)/nbin;
397  std::cout << binValue << " ";
398  // std::cout << minValue+static_cast<double>(maxValue-minValue)*(irow+0.5)/nbin << " ";
399  for(int icol=0;icol<col_opt.size();++icol){
400  if(relative_opt[0])
401  std::cout << 100.0*static_cast<double>(statVector[icol][irow])/static_cast<double>(dataVector[icol].size());
402  else
403  std::cout << statVector[icol][irow];
404  if(icol<col_opt.size()-1)
405  cout << " ";
406  }
407  cout << endl;
408  }
409  }
410  if(histogram2d_opt[0]){
411  assert(nbin);
412  assert(dataVector.size()==2);
413  assert(dataVector[0].size()==dataVector[1].size());
414  double sigma=0;
415  //kernel density estimation as in http://en.wikipedia.org/wiki/Kernel_density_estimation
416  if(kde_opt[0]){
417  // if(kde_opt[0]>0)
418  // sigma=kde_opt[0];
419  // else
420  sigma=1.06*sqrt(sqrt(stat.var(dataVector[0]))*sqrt(stat.var(dataVector[1])))*pow(dataVector[0].size(),-0.2);
421  }
422  assert(nbin);
423  if(verbose_opt[0]){
424  if(sigma>0)
425  std::cout << "calculating 2d kernel density estimate with sigma " << sigma << " for cols " << col_opt[0] << " and " << col_opt[1] << std::endl;
426  else
427  std::cout << "calculating 2d histogram for cols " << col_opt[0] << " and " << col_opt[1] << std::endl;
428  std::cout << "nbin: " << nbin << std::endl;
429  }
430  std::vector< std::vector<double> > histVector;
431  stat.distribution2d(dataVector[0],dataVector[1],histVector,nbin,minX,maxX,minY,maxY,sigma);
432  for(int binX=0;binX<nbin;++binX){
433  std::cout << std::endl;
434  for(int binY=0;binY<nbin;++binY){
435  double binValueX=0;
436  if(nbin==maxX-minX+1)
437  binValueX=minX+binX;
438  else
439  binValueX=minX+static_cast<double>(maxX-minX)*(binX+0.5)/nbin;
440  double binValueY=0;
441  if(nbin==maxY-minY+1)
442  binValueY=minY+binY;
443  else
444  binValueY=minY+static_cast<double>(maxY-minY)*(binY+0.5)/nbin;
445  double value=0;
446  value=static_cast<double>(histVector[binX][binY])/dataVector[0].size();
447  std::cout << binValueX << " " << binValueY << " " << value << std::endl;
448  // std::cout << minX+static_cast<double>(maxX-minX)*(binX+0.5)/nbin << " " << minY+static_cast<double>(maxY-minY)*(binY+0.5)/nbin << " " << value << std::endl;
449  }
450  }
451  }
452
453  if(output_opt[0]){
454  if(transpose_opt[0]){
455  for(int icol=0;icol<col_opt.size();++icol){
456  for(int irow=0;irow<dataVector.begin()->size();++irow){
457  cout << dataVector[icol][irow];
458  if(irow<dataVector.begin()->size()-1)
459  cout << " ";
460  }
461  cout << endl;
462  }
463  }
464  else{
465  for(int irow=0;irow<dataVector.begin()->size();++irow){
466  for(int icol=0;icol<col_opt.size();++icol){
467  cout << dataVector[icol][irow];
468  if(icol<col_opt.size()-1)
469  cout << " ";
470  }
471  cout << endl;
472  }
473  }
474  }
475 }
STL namespace.