24 #include "base/Optionpk.h" 25 #include "fileclasses/FileReaderAscii.h" 26 #include "algorithms/StatFactory.h" 92 int main(
int argc,
char *argv[])
96 Optionpk<char> comment_opt(
"comment",
"comment",
"comment character",
'#');
97 Optionpk<bool> output_opt(
"o",
"output",
"output the selected columns",
false);
98 Optionpk<bool> transpose_opt(
"t",
"transpose",
"transpose input ascii vector (use in combination with --output)",
false);
99 Optionpk<int> col_opt(
"c",
"column",
"column nr, starting from 0", 0);
100 Optionpk<int> range_opt(
"r",
"range",
"rows to start/end reading. Use -r 1 -r 10 to read first 10 rows where first row is header. Use 0 to read all rows with no header.", 0);
103 Optionpk<std::string> randdist_opt(
"dist",
"dist",
"distribution for generating random numbers, see http://www.gn/software/gsl/manual/gsl-ref_toc.html#TOC320 (only uniform and Gaussian supported yet)",
"gaussian");
104 Optionpk<double> randa_opt(
"rnda",
"rnda",
"first parameter for random distribution (mean value in case of Gaussian)", 0);
105 Optionpk<double> randb_opt(
"rndb",
"rndb",
"second parameter for random distribution (standard deviation in case of Gaussian)", 1);
107 Optionpk<bool> median_opt(
"median",
"median",
"calculate median",
false);
109 Optionpk<bool> skewness_opt(
"skew",
"skewness",
"calculate skewness",
false);
110 Optionpk<bool> kurtosis_opt(
"kurt",
"kurtosis",
"calculate kurtosis",
false);
111 Optionpk<bool> stdev_opt(
"stdev",
"stdev",
"calculate standard deviation",
false);
112 Optionpk<bool> sum_opt(
"sum",
"sum",
"calculate sum of column",
false);
113 Optionpk<bool> minmax_opt(
"mm",
"minmax",
"calculate minimum and maximum value",
false);
114 Optionpk<bool> min_opt(
"min",
"min",
"calculate minimum value",
false);
115 Optionpk<bool> max_opt(
"max",
"max",
"calculate maximum value",
false);
116 Optionpk<double> src_min_opt(
"src_min",
"src_min",
"start reading source from this minimum value");
117 Optionpk<double> src_max_opt(
"src_max",
"src_max",
"stop reading source from this maximum value");
118 Optionpk<bool> histogram_opt(
"hist",
"hist",
"calculate histogram",
false);
119 Optionpk<bool> histogram2d_opt(
"hist2d",
"hist2d",
"calculate 2-dimensional histogram based on two columns",
false);
120 Optionpk<short> nbin_opt(
"nbin",
"nbin",
"number of bins to calculate histogram");
121 Optionpk<bool> relative_opt(
"rel",
"relative",
"use percentiles for histogram to calculate histogram",
false);
122 Optionpk<bool> kde_opt(
"kde",
"kde",
"Use Kernel density estimation when producing histogram. The standard deviation is estimated based on Silverman's rule of thumb",
false);
123 Optionpk<bool> correlation_opt(
"cor",
"correlation",
"calculate Pearson produc-moment correlation coefficient between two columns (defined by -c <col1> -c <col2>",
false);
124 Optionpk<bool> rmse_opt(
"rmse",
"rmse",
"calculate root mean square error between two columns (defined by -c <col1> -c <col2>",
false);
125 Optionpk<bool> reg_opt(
"reg",
"regression",
"calculate linear regression between two columns and get correlation coefficient (defined by -c <col1> -c <col2>",
false);
126 Optionpk<bool> regerr_opt(
"regerr",
"regerr",
"calculate linear regression between two columns and get root mean square error (defined by -c <col1> -c <col2>",
false);
127 Optionpk<short> verbose_opt(
"v",
"verbose",
"verbose mode when positive", 0,2);
129 src_min_opt.setHide(1);
130 src_max_opt.setHide(1);
132 range_opt.setHide(1);
133 output_opt.setHide(1);
134 transpose_opt.setHide(1);
135 comment_opt.setHide(1);
140 doProcess=input_opt.retrieveOption(argc,argv);
141 col_opt.retrieveOption(argc,argv);
143 size_opt.retrieveOption(argc,argv);
144 rand_opt.retrieveOption(argc,argv);
145 randdist_opt.retrieveOption(argc,argv);
146 randa_opt.retrieveOption(argc,argv);
147 randb_opt.retrieveOption(argc,argv);
148 mean_opt.retrieveOption(argc,argv);
149 median_opt.retrieveOption(argc,argv);
150 var_opt.retrieveOption(argc,argv);
151 stdev_opt.retrieveOption(argc,argv);
152 skewness_opt.retrieveOption(argc,argv);
153 kurtosis_opt.retrieveOption(argc,argv);
154 sum_opt.retrieveOption(argc,argv);
155 minmax_opt.retrieveOption(argc,argv);
156 min_opt.retrieveOption(argc,argv);
157 max_opt.retrieveOption(argc,argv);
158 histogram_opt.retrieveOption(argc,argv);
159 nbin_opt.retrieveOption(argc,argv);
160 relative_opt.retrieveOption(argc,argv);
161 kde_opt.retrieveOption(argc,argv);
162 histogram2d_opt.retrieveOption(argc,argv);
163 correlation_opt.retrieveOption(argc,argv);
164 rmse_opt.retrieveOption(argc,argv);
165 reg_opt.retrieveOption(argc,argv);
166 regerr_opt.retrieveOption(argc,argv);
168 src_min_opt.retrieveOption(argc,argv);
169 src_max_opt.retrieveOption(argc,argv);
170 fs_opt.retrieveOption(argc,argv);
171 range_opt.retrieveOption(argc,argv);
172 output_opt.retrieveOption(argc,argv);
173 transpose_opt.retrieveOption(argc,argv);
174 comment_opt.retrieveOption(argc,argv);
175 verbose_opt.retrieveOption(argc,argv);
177 catch(
string predefinedString){
178 std::cout << predefinedString << std::endl;
183 cout <<
"Usage: pkstatascii -i input [-c column]*" << endl;
185 std::cout <<
"short option -h shows basic options only, use long option --help to show all options" << std::endl;
189 if(src_min_opt.size()){
190 while(src_min_opt.size()<col_opt.size())
191 src_min_opt.push_back(src_min_opt[0]);
193 if(src_max_opt.size()){
194 while(src_max_opt.size()<col_opt.size())
195 src_max_opt.push_back(src_max_opt[0]);
199 gsl_rng* r=stat.getRandomGenerator(time(NULL));
202 std::cout <<
"generating " << rand_opt[0] <<
" random numbers: " << std::endl;
203 for(
unsigned int i=0;i<rand_opt[0];++i)
204 std::cout << i <<
" " << stat.getRandomValue(r,randdist_opt[0],randa_opt[0],randb_opt[0]) << std::endl;
206 vector< vector<double> > dataVector(col_opt.size());
207 vector< vector<double> > statVector(col_opt.size());
209 if(!input_opt.size())
212 asciiReader.setFieldSeparator(fs_opt[0]);
213 asciiReader.setComment(comment_opt[0]);
214 asciiReader.setMinRow(range_opt[0]);
215 if(range_opt.size()>1)
216 asciiReader.setMaxRow(range_opt[1]);
217 asciiReader.readData(dataVector,col_opt);
218 assert(dataVector.size());
224 if(histogram_opt[0]){
225 stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minValue,maxValue);
226 if(src_min_opt.size())
227 minValue=src_min_opt[0];
228 if(src_max_opt.size())
229 maxValue=src_max_opt[0];
231 std::cerr <<
"Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
232 nbin=maxValue-minValue+1;
239 if(histogram2d_opt[0]){
240 assert(col_opt.size()==2);
242 std::cerr <<
"Warning: number of bins not defined, calculating bins from min and max value" << std::endl;
243 stat.minmax(dataVector[0],dataVector[0].begin(),dataVector[0].end(),minX,maxX);
244 stat.minmax(dataVector[1],dataVector[1].begin(),dataVector[1].end(),minY,maxY);
245 if(src_min_opt.size())
247 if(src_min_opt.size()>1)
249 if(src_max_opt.size())
251 if(src_max_opt.size()>1)
253 minValue=(minX<minY)? minX:minY;
254 maxValue=(maxX>maxY)? maxX:maxY;
256 std::cout <<
"min and max values: " << minValue <<
", " << maxValue << std::endl;
257 nbin=maxValue-minValue+1;
260 for(
int icol=0;icol<col_opt.size();++icol){
261 if(!dataVector[icol].size()){
262 std::cerr <<
"Warning: dataVector[" << icol <<
"] is empty" << std::endl;
266 cout <<
"sample size column " << col_opt[icol] <<
": " << dataVector[icol].size() << endl;
268 cout <<
"mean value column " << col_opt[icol] <<
": " << stat.mean(dataVector[icol]) << endl;
270 cout <<
"variance value column " << col_opt[icol] <<
": " << stat.var(dataVector[icol]) << endl;
272 cout <<
"standard deviation column " << col_opt[icol] <<
": " << sqrt(stat.var(dataVector[icol])) << endl;
274 cout <<
"skewness value column " << col_opt[icol] <<
": " << stat.skewness(dataVector[icol]) << endl;
276 cout <<
"kurtosis value column " << col_opt[icol] <<
": " << stat.kurtosis(dataVector[icol]) << endl;
278 cout << setprecision(2);
279 cout << fixed <<
"sum column " << col_opt[icol] <<
": " << (stat.sum(dataVector[icol])) << endl;
282 cout <<
"median value column " << col_opt[icol] <<
": " << stat.median(dataVector[icol]) << endl;
284 cout <<
"min value column " << col_opt[icol] <<
": " << stat.mymin(dataVector[icol]) << endl;
285 cout <<
"max value column " << col_opt[icol] <<
": " << stat.mymax(dataVector[icol]) << endl;
288 cout <<
"min value column " << col_opt[icol] <<
": " << stat.mymin(dataVector[icol]) << endl;
290 cout <<
"max value column " << col_opt[icol] <<
": " << stat.mymax(dataVector[icol]) << endl;
291 if(histogram_opt[0]){
298 sigma=1.06*sqrt(stat.var(dataVector[icol]))*pow(dataVector[icol].size(),-0.2);
303 std::cout <<
"calculating kernel density estimate with sigma " << sigma <<
" for col " << icol << std::endl;
305 std::cout <<
"calculating histogram for col " << icol << std::endl;
360 stat.distribution(dataVector[icol],dataVector[icol].begin(),dataVector[icol].end(),statVector[icol],nbin,minValue,maxValue,sigma);
362 std::cout <<
"min and max values: " << minValue <<
", " << maxValue << std::endl;
365 if(correlation_opt[0]){
366 assert(dataVector.size()==2);
367 cout <<
"correlation between columns " << col_opt[0] <<
" and " << col_opt[1] <<
": " << stat.correlation(dataVector[0],dataVector[1]) << endl;
370 assert(dataVector.size()==2);
371 cout <<
"root mean square error between columns " << col_opt[0] <<
" and " << col_opt[1] <<
": " << stat.rmse(dataVector[0],dataVector[1]) << endl;
374 assert(dataVector.size()==2);
377 double r2=stat.linear_regression(dataVector[0],dataVector[1],c0,c1);
378 cout <<
"linear regression between columns: " << col_opt[0] <<
" and " << col_opt[1] <<
": " << c0 <<
"+" << c1 <<
" * x " <<
" with R^2 (square correlation coefficient): " << r2 << endl;
381 assert(dataVector.size()==2);
384 double err=stat.linear_regression_err(dataVector[0],dataVector[1],c0,c1);
386 cout <<
"linear regression between columns: " << col_opt[0] <<
" and " << col_opt[1] <<
": " << c0 <<
"+" << c1 <<
" * x " <<
" with rmse: " << err << endl;
388 cout << c0 <<
" " << c1 <<
" " << err << endl;
390 if(histogram_opt[0]){
391 for(
int irow=0;irow<statVector.begin()->size();++irow){
393 if(nbin==maxValue-minValue+1)
394 binValue=minValue+irow;
396 binValue=minValue+
static_cast<double>(maxValue-minValue)*(irow+0.5)/nbin;
397 std::cout << binValue <<
" ";
399 for(
int icol=0;icol<col_opt.size();++icol){
401 std::cout << 100.0*
static_cast<double>(statVector[icol][irow])/static_cast<double>(dataVector[icol].size());
403 std::cout << statVector[icol][irow];
404 if(icol<col_opt.size()-1)
410 if(histogram2d_opt[0]){
412 assert(dataVector.size()==2);
413 assert(dataVector[0].size()==dataVector[1].size());
420 sigma=1.06*sqrt(sqrt(stat.var(dataVector[0]))*sqrt(stat.var(dataVector[1])))*pow(dataVector[0].size(),-0.2);
425 std::cout <<
"calculating 2d kernel density estimate with sigma " << sigma <<
" for cols " << col_opt[0] <<
" and " << col_opt[1] << std::endl;
427 std::cout <<
"calculating 2d histogram for cols " << col_opt[0] <<
" and " << col_opt[1] << std::endl;
428 std::cout <<
"nbin: " << nbin << std::endl;
430 std::vector< std::vector<double> > histVector;
431 stat.distribution2d(dataVector[0],dataVector[1],histVector,nbin,minX,maxX,minY,maxY,sigma);
432 for(
int binX=0;binX<nbin;++binX){
433 std::cout << std::endl;
434 for(
int binY=0;binY<nbin;++binY){
436 if(nbin==maxX-minX+1)
439 binValueX=minX+
static_cast<double>(maxX-minX)*(binX+0.5)/nbin;
441 if(nbin==maxY-minY+1)
444 binValueY=minY+
static_cast<double>(maxY-minY)*(binY+0.5)/nbin;
446 value=
static_cast<double>(histVector[binX][binY])/dataVector[0].size();
447 std::cout << binValueX <<
" " << binValueY <<
" " << value << std::endl;
454 if(transpose_opt[0]){
455 for(
int icol=0;icol<col_opt.size();++icol){
456 for(
int irow=0;irow<dataVector.begin()->size();++irow){
457 cout << dataVector[icol][irow];
458 if(irow<dataVector.begin()->size()-1)
465 for(
int irow=0;irow<dataVector.begin()->size();++irow){
466 for(
int icol=0;icol<col_opt.size();++icol){
467 cout << dataVector[icol][irow];
468 if(icol<col_opt.size()-1)