pktools  2.6.7
Processing Kernel for geospatial data
FeatureSelector.h
1 /**********************************************************************
2 FeatureSelector.h: select features, typical use: feature selection for classification
3 Copyright (C) 2008-2012 Pieter Kempeneers
4 
5 This file is part of pktools
6 
7 pktools is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11 
12 pktools is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with pktools. If not, see <http://www.gnu.org/licenses/>.
19 ***********************************************************************/
20 #ifndef _FEATURESELECTOR_H_
21 #define _FEATURESELECTOR_H_
22 
23 #include <math.h>
24 #include <vector>
25 #include <list>
26 #include <algorithm>
27 #include <iostream>
28 #include <iomanip>
29 #include "ConfusionMatrix.h"
30 #include "base/IndexValue.h"
31 #include "base/Vector2d.h"
32 #include "gsl/gsl_combination.h"
33 #include "CostFactory.h"
34 
36 {
37  public:
38  FeatureSelector(){};
39  ~FeatureSelector(){};
40  template<class T> double forward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
41  template<class T> double backward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int minFeatures, short verbose=0);
42  template<class T> double floating(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, double epsilon=0.001, short verbose=0);
43  template<class T> double bruteForce(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
44 
45  private:
46  template<class T> double addFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, short verbose=0);
47  template<class T> double removeFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int& r, short verbose=0);
48  template<class T> double forwardUnivariate(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures=0, short verbose=0);
49 };
50 
51 //sequential forward selection Univariate (N single best features)
52 template<class T> double FeatureSelector::forwardUnivariate(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
53  int maxLevels=v[0][0].size();
54  if(!maxFeatures)
55  maxFeatures=maxLevels;
56  int k=subset.size();
57  if(k>=maxFeatures)
58  return -1;
59  std::vector<IndexValue> cost(maxLevels);
60  std::list<int> tmpset=subset;//temporary set of selected features (levels)
61  std::vector< Vector2d<T> > tmp(v.size());
62  for(int ilevel=0;ilevel<maxLevels;++ilevel){
63  if(find(tmpset.begin(),tmpset.end(),ilevel)==tmpset.end()){
64  tmpset.push_back(ilevel);
65  for(int iclass=0;iclass<v.size();++iclass){
66 // tmp[iclass].resize(v[iclass].size());
67  v[iclass].selectCols(tmpset,tmp[iclass]);
68  }
69  try{
70  IndexValue pv;
71  pv.position=ilevel;
72  pv.value=theCostFactory.getCost(tmp);
73  cost[ilevel]=pv;
74  }
75  catch(...){
76  IndexValue pv;
77  pv.position=ilevel;
78  pv.value=-1;
79  cost[ilevel]=pv;
80  }
81  tmpset.pop_back();
82  }
83  }
84  sort(cost.begin(),cost.end(),Compare_IndexValue());//decreasing order
85  int ilevel=0;
86  while((subset.size()<maxFeatures)&&(ilevel<maxLevels)){
87  if(cost[ilevel].value>0)
88  subset.push_back(cost[ilevel].position);
89  if(verbose)
90  std::cout << "feature " << subset.back() << " has cost: " << cost[ilevel].value << std::endl;
91  ++ilevel;
92  }
93  double maxCost=-1;
94  while(subset.size()){
95  for(int iclass=0;iclass<v.size();++iclass){
96 // tmp[iclass].resize(v[iclass].size());
97  v[iclass].selectCols(subset,tmp[iclass]);
98  }
99  try{
100  maxCost=theCostFactory.getCost(tmp);
101  }
102  catch(...){
103  subset.pop_back();
104  continue;
105  }
106  return maxCost;
107  }
108 }
109 
110 //sequential forward selection Multivariate (Combination of N best features)
111 template<class T> double FeatureSelector::forward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
112  //Select feature with the best value (get maximal cost for 1 feature)
113  double maxCost=0;
114  int maxLevels=v[0][0].size();
115  if(!maxFeatures)
116  maxFeatures=maxLevels;
117  while(subset.size()<maxFeatures){
118  maxCost=addFeature(v,theCostFactory,subset,verbose);
119  if(verbose){
120  for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
121  std::cout << *lit << " ";
122  std::cout << std::endl;
123  // std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << maxCost << std::endl;
124  }
125  }//while
126  return maxCost;
127 }
128 
129 //sequential backward selection
130 template<class T> double FeatureSelector::backward(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int minFeatures, short verbose){
131  //Select features with least effect on cost when removed (obtain minFeatures eventually)
132  double maxCost=0;
133  int removedFeature;
134  if(subset.empty()){
135  for(int iFeature=0;iFeature<v[0][0].size();++iFeature)
136  subset.push_back(iFeature);
137  }
138  if(subset.size()==minFeatures)
139  maxCost=theCostFactory.getCost(v);
140  while(subset.size()>minFeatures){
141  maxCost=removeFeature(v,theCostFactory,subset,removedFeature,verbose);
142  if(verbose){
143  for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
144  std::cout << *lit << " ";
145  std::cout << std::endl;
146  // std::cout << "removed " << removedFeature << ", " << subset.size() << "/" << minFeatures << " features remain with cost: " << maxCost << std::endl;
147  }
148  }//while
149  return maxCost;
150 }
151 
152 //floating search
153 template<class T> double FeatureSelector::floating(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, double epsilon, short verbose){
154  std::vector<T> cost;
155  int maxLevels=v[0][0].size();
156  if(maxFeatures<1)
157  maxFeatures=maxLevels;
158  int k=subset.size();
159  if(k>=maxFeatures)
160  return -1;
161  while(cost.size()<subset.size())
162  cost.push_back(1);//init original features as cost 1
163  cost.push_back(addFeature(v,theCostFactory,subset,verbose));
164  ++k;
165  if(verbose>1)
166  std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << std::endl;
167  else if(verbose){
168  for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
169  std::cout << *lit << " ";
170  std::cout << std::endl;
171  }
172  while(k<maxFeatures){
173  cost.push_back(addFeature(v,theCostFactory,subset,verbose));
174  ++k;
175  if(verbose>1)
176  std::cout << "added " << subset.back() << ", " << subset.size() << "/" << maxFeatures << " features selected with cost: " << cost.back() << std::endl;
177  else if(verbose){
178  for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
179  std::cout << *lit << " ";
180  std::cout << " (cost: " << cost.back() << ")" << std::endl;
181  }
182 
183  while(k>1){
184  int x_r;
185  double cost_r=removeFeature(v,theCostFactory,subset,x_r,verbose);
186  if(cost_r>cost[k-1]+epsilon){
187  --k;
188  cost[k]=cost_r;
189  cost.pop_back();
190  if(verbose>1)
191  std::cout << "removed " << x_r << ", " << subset.size() << "/" << maxFeatures << " features remain with cost: " << cost_r << std::endl;
192  else if(verbose){
193  for(std::list<int>::const_iterator lit=subset.begin();lit!=subset.end();++lit)
194  std::cout << *lit << " ";
195  std::cout << " (cost: " << cost.back() << ")" << std::endl;
196  }
197  continue;
198  }
199  else if(cost_r>=0){
200  subset.push_back(x_r);
201  break;
202  }
203  else if(verbose)
204  std::cout << "could not remove any feature" << std::endl;
205  cost.pop_back();
206  }
207  }
208  return cost.back();
209 }
210 
211 //brute force search (search for all possible combinations and select best)
212 template<class T> double FeatureSelector::bruteForce(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int maxFeatures, short verbose){
213  int maxLevels=v[0][0].size();
214  if(maxFeatures<1)
215  maxFeatures=maxLevels;
216  int k=subset.size();
217  if(k>=maxFeatures)
218  return -1;
219 // gslmm::combination c(v1[0].size(),maxFeatures,false);
220  gsl_combination *c;
221  c=gsl_combination_calloc(v[0][0].size(),maxFeatures);
222 
223  std::list<int> tmpset;//temporary set of selected features (levels)
224  std::vector< Vector2d<T> > tmpv(v.size());
225  std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
226  //initialize maxCost with actual cost for current subset (-1 if empty subset)
227  double maxCost=-1;
228  double cost;
229  if(subset.size()>=maxLevels)
230  return maxCost;
231  gsl_combination_next(c);
232  do{
233  for(int ifeature=0;ifeature<maxFeatures;++ifeature)
234  tmpset.push_back(c->data[ifeature]);
235  for(int iclass=0;iclass<v.size();++iclass)
236  v[iclass].selectCols(tmpset,tmpv[iclass]);
237  try{
238  cost=theCostFactory.getCost(tmpv);
239  }
240  catch(...){ //singular matrix encountered
241  catchset=tmpset;//this tmpset resulted in failure of getCost
242  if(verbose){
243  std::cout << "Could not get cost from set: " << std::endl;
244  gsl_combination_fprintf(stdout,c," %u");
245  printf("\n");
246  }
247  tmpset.clear();
248  continue;
249  }
250  if(maxCost<cost){ //set with better cost is found
251  maxCost=cost;
252  subset=tmpset;
253  }
254  tmpset.clear();
255  }while(gsl_combination_next(c)==GSL_SUCCESS);
256  gsl_combination_free(c);
257 // }while(c.next());
258  if(maxCost<0) //no level added to better maxCost than current subset (catchset)
259  subset=catchset;
260  //test: assert list contains no duplicate elements
261  for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
262  std::list<int>::iterator lit2=lit;//start searching from next element
263  assert(find(++lit2,subset.end(),*lit)==subset.end());
264  }
265  return maxCost;
266 }
267 
268 template<class T> double FeatureSelector::addFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, short verbose){
269  //select feature with the best value (get maximal cost for 1 feature)
270  std::list<int> tmpset=subset;//temporary set of selected features (levels)
271  std::vector< Vector2d<T> > tmp(v.size());
272  std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
273  //initialize maxCost with actual cost for current subset (-1 if empty subset)
274  double maxCost=-1;
275  double cost;
276  int maxLevels=v[0][0].size();
277  if(subset.size()>=maxLevels)
278  return maxCost;
279  for(int ilevel=0;ilevel<maxLevels;++ilevel){
280  if(find(tmpset.begin(),tmpset.end(),ilevel)!=tmpset.end())
281  continue;
282  tmpset.push_back(ilevel);
283  for(int iclass=0;iclass<v.size();++iclass){
284 // tmp[iclass].resize(v[iclass].size());
285  v[iclass].selectCols(tmpset,tmp[iclass]);
286  }
287  try{
288  cost=theCostFactory.getCost(tmp);
289  }
290  catch(...){
291  catchset=tmpset;//this tmpset resulted in singular matrix
292  if(verbose)
293  std::cout << "Could not add feature " << tmpset.back() << std::endl;
294  tmpset.pop_back();
295  continue;
296  }
297  if(maxCost<cost){ //level with better cost is found
298  maxCost=cost;
299  subset=tmpset;
300  }
301  tmpset.pop_back();
302  }
303  if(maxCost<0) //no level added to better maxCost than current subset (catchset)
304  subset=catchset;
305  //test: assert list contains no duplicate elements
306  for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
307  std::list<int>::iterator lit2=lit;//start searching from next element
308  assert(find(++lit2,subset.end(),*lit)==subset.end());
309  }
310  return maxCost;
311 }
312 
313 template<class T> double FeatureSelector::removeFeature(std::vector< Vector2d<T> >& v, CostFactory& theCostFactory, std::list<int>& subset, int& r, short verbose){
314  //find the feature that has the least effect on the cost when it is removed from subset
315  std::list<int> tmpset=subset;//temporary set of selected features (levels)
316  std::vector< Vector2d<T> > tmp(v.size());
317  int nFeatures=subset.size();
318  std::list<int> catchset;//restore set in case of catch all the way to last level (no better cost)
319  //initialize maxCost with actual cost for current subset (-1 if empty subset)
320  double maxCost=-1;
321  int last;
322  double cost;
323  int maxLevels=v[0][0].size();
324  if(subset.size()>maxLevels||subset.empty()){
325  return maxCost;
326  }
327  std::list<int>::const_iterator it;
328  for(int i=0;i<nFeatures;++i){
329  last=tmpset.back();
330  tmpset.pop_back();
331  for(int iclass=0;iclass<v.size();++iclass){
332 // tmp[iclass].resize(v[iclass].size());
333  v[iclass].selectCols(tmpset,tmp[iclass]);
334  }
335  try{
336  cost=theCostFactory.getCost(tmp);
337  }
338  catch(...){
339  catchset=tmpset;//this tmpset resulted in singular matrix
340  if(verbose)
341  std::cout << "Could not remove feature " << last << std::endl;
342  tmpset.push_front(last);
343  continue;
344  }
345  if(maxCost<cost){ //level with better cost is found
346  maxCost=cost;
347  subset=tmpset;
348  r=last;
349  }
350  tmpset.push_front(last);
351  }
352  if(maxCost<0){//all levels removed were caught
353  subset=catchset;
354  }
355  //test: assert list contains no duplicate elements
356  for(std::list<int>::iterator lit=subset.begin();lit!=subset.end();++lit){
357  std::list<int>::iterator lit2=lit;//start searching from next element
358  assert(find(++lit2,subset.end(),*lit)==subset.end());
359  }
360  return maxCost;
361 }
362 
363 #endif /* _FEATURESELECTOR_H_ */