distbindata.cc

// Usage banner printed by CmdLine (shown with -h). The text follows the
// Torch3 tool convention (progname / description / version / author);
// it is a runtime string and is kept verbatim.
const char *help = "\
progname: distbindata.cc\n\
code2html: This program computes a distance between 2 bindata files.\n\
version: Torch3 vision2.0, 2004-2005\n\
(c) Sebastien Marcel (marcel@idiap.ch)\n";
  6 
  7 // core
  8 #include "string_utils.h"
  9 
 10 // datasets
 11 #include "FileBinDataSet.h"
 12 
 13 // machines
 14 #include "ConnectedMachine.h"
 15 #include "Linear.h"
 16 #include "Tanh.h"
 17 #include "LogSoftMax.h"
 18 
 19 // normalisation
 20 #include "MyMeanVarNorm.h"
 21 
 22 // metrics
 23 #include "Pearson.h"
 24 #include "Canberra.h"
 25 #include "NormalizeCorrelation.h"
 26 #include "StandardCorrelation.h"
 27 #include "StandardCovariance.h"
 28 #include "ChiSquare.h"
 29 #include "TangentDistance.h"
 30 #include "Mahanalobis.h"
 31 
 32 // eigen
 33 #include "PCAMachine.h"
 34 
 35 // misc
 36 #include "CmdLine.h"
 37 #include "FileListCmdOption.h"
 38 
 39 using namespace Torch;
 40 
 41 real mMlpOneHot(int n_inputs, real *x, real *y, ConnectedMachine *mlp, MyMeanVarNorm *mv_norm, Sequence *seq, bool diff, bool delta)
 42 {
 43    	if(diff)
 44 		for(int i = 0 ; i < n_inputs ; i++) seq->frames[0][i] = x[i] - y[i];
 45 	else if(delta)
 46 	{
 47 	   	int j = 0;
 48 		for(int i = 0 ; i < n_inputs ; i++, j++) seq->frames[0][j] = x[i] - y[i];
 49 		for(int i = 0 ; i < n_inputs ; i++, j++) seq->frames[0][j] = y[i];
 50 	}
 51 	else
 52 	{
 53 	   	int j = 0;
 54 		for(int i = 0 ; i < n_inputs ; i++, j++) seq->frames[0][j] = x[i];
 55 		for(int i = 0 ; i < n_inputs ; i++, j++) seq->frames[0][j] = y[i];
 56 	}
 57 	
 58    	mv_norm->preProcessInputs(seq);
 59 	
 60 	mlp->forward(seq);
 61 	   	
 62 	return mlp->outputs->frames[0][0] - mlp->outputs->frames[0][1];
 63 }
 64 
 65 int main(int argc, char **argv)
 66 {
 67   	char *template_file;
 68   	char *model_file;
 69 	char *score_filename;
 70 	char *norm_model_filename;
 71 	bool use_mean_template;
 72 	bool verbose;
 73 	bool one_score_per_file;
 74 
 75 	//
 76 	bool mahalanobis;
 77 	bool canberra;
 78 	bool pearson;
 79 	bool nc;
 80 	bool stdcor;
 81 	bool stdcov;
 82 	bool chisquare;
 83 	bool td;
 84 	bool mlpmetric1hot;
 85 	
 86 	//
 87 	bool diff;
 88 	bool delta;
 89 	int dim;
 90 	
 91 	//
 92 	int width;
 93 	int height;
 94 
 95 	//
 96   	Allocator *allocator = new Allocator;
 97   	DiskXFile::setLittleEndianMode();
 98 
 99   	//=================== The command-line ==========================
100 	FileListCmdOption filelist("file name", "the list files or one data file");
101         filelist.isArgument(true);
102 
103   	// Construct the command line
104   	CmdLine cmd;
105 	cmd.setBOption("write log", false);
106 	
107   	// Put the help line at the beginning
108   	cmd.info(help);
109 
110   	// Train mode
111   	cmd.addText("\nArguments:");
112   	cmd.addSCmdArg("template", &template_file, "the template file to compare with");
113   	cmd.addCmdOption(&filelist);
114   	cmd.addText("\nOptions:");
115   	cmd.addBCmdOption("-verbose", &verbose, false, "verbose", true);
116   	cmd.addBCmdOption("-use_mean", &use_mean_template, false, "use the mean model", true);
117   	cmd.addBCmdOption("-one_score_per_file", &one_score_per_file, false, "computes one score per input file", true);
118   	cmd.addSCmdOption("-score", &score_filename, "", "score filename");
119   	cmd.addSCmdOption("-norm", &norm_model_filename, "", "norm model filename");
120   	cmd.addText("\nFeatures:");
121   	cmd.addBCmdOption("-diff", &diff, false, "diff input features", true);
122   	cmd.addBCmdOption("-delta", &delta, false, "delta input features", true);
123   	cmd.addICmdOption("-dim", &dim, -1, "dimension to use", true);
124   	cmd.addText("\nMetrics:");
125   	cmd.addBCmdOption("-mahalanobis", &mahalanobis, false, "Mahalanobis metric", true);
126   	cmd.addBCmdOption("-canberra", &canberra, false, "Canberra metric", true);
127   	cmd.addBCmdOption("-pearson", &pearson, false, "one minus Pearson correlation", true);
128   	cmd.addBCmdOption("-nc", &nc, false, "Normalized correlation", true);
129   	cmd.addBCmdOption("-stdcor", &stdcor, false, "Standard Correlation", true);
130   	cmd.addBCmdOption("-stdcov", &stdcov, false, "Standard Covariance", true);
131   	cmd.addBCmdOption("-chisquare", &chisquare, false, "Chi Square", true);
132   	cmd.addBCmdOption("-td", &td, false, "tangent distance", true);
133   	cmd.addICmdOption("-width", &width, -1, "width of the image for tangent distance", true);
134   	cmd.addICmdOption("-height", &height, -1, "height of the image for tangent distance", true);
135   	cmd.addBCmdOption("-mlpmetric1hot", &mlpmetric1hot, false, "mlpmetric1hot distance");
136   	cmd.addSCmdOption("-model", &model_file, "", "model filename");
137 
138   	// Read the command line
139   	cmd.read(argc, argv);
140 
141 	//
142 	if(verbose)
143 	{
144 		if(mahalanobis) print("Using Mahalanobis-cosine metric with PCA model %s\n", model_file);
145 		else if(td) print("Using Tangent distance on %dx%d images\n", width, height);
146 		else if(canberra) print("Using Canberra metric\n");
147 		else if(pearson) print("Using one minus Pearson correlation\n");
148 		else if(nc) print("Using Normalized correlation\n");
149 		else if(stdcor) print("Using Standard Correlation\n");
150 		else if(stdcov) print("Using Standard Covariance\n");
151 		else if(chisquare) print("Using Chi Square\n");
152 		else if(mlpmetric1hot) print("Using One hot MLP metric with model %s\n", model_file);
153 		else print("No metric chosen, setting to Euclidean by default\n");
154 
155 		print(" + n_filenames = %d\n", filelist.n_files);
156 		for(int i = 0 ; i < filelist.n_files ; i++)
157 			print("   filename[%d] = %s\n", i, filelist.file_names[i]);
158 	}
159 
160 
161 
162 	// load the template
163 	int n_inputs_template;
164 	int n_patterns_model;
165 	
166 	DiskXFile model(template_file, "r");
167 
168 	model.read(&n_patterns_model, sizeof(int), 1);
169 	model.read(&n_inputs_template, sizeof(int), 1);
170 
171 	if(verbose)
172 	{
173 		print(" Number of inputs = %d\n", n_inputs_template);
174 		print(" Number of reference patterns = %d\n", n_patterns_model);
175 	}
176 
177 	real **ref_model = new real*[n_patterns_model];
178 	real *mean_model = new real [n_inputs_template];
179 	for(int j=0; j< n_inputs_template; j++) mean_model[j] = 0.0;
180 	for(int p = 0 ; p < n_patterns_model ; p++)
181 	{
182 		ref_model[p] = new real [n_inputs_template];
183 		model.read(ref_model[p], sizeof(real), n_inputs_template);
184 		for(int j=0; j< n_inputs_template; j++)
185 		{
186 			mean_model[j] += ref_model[p][j];
187 		}
188 	}
189 	for(int j=0; j< n_inputs_template; j++) mean_model[j] /= (real) n_patterns_model;
190 
191 
192 	real *inputs = new real [n_inputs_template];
193 
194 
195 	// load the normalization
196 	DiskXFile *normfile = NULL;
197 	real mu = 0.0;
198 	real sigma = 1.0;
199 	if(strcmp(norm_model_filename, "") != 0)
200 	{
201 		normfile = new(allocator) DiskXFile(norm_model_filename, "r");
202 		normfile->read(&mu, sizeof(real), 1);
203 		normfile->read(&sigma, sizeof(real), 1);
204 		print("Norm model (%s): mu=%g \t sigma = %g\n", norm_model_filename, mu, sigma);
205 	}
206 	
207 	
208 	//
209 	DiskXFile *scorefile = NULL;
210 	if(strcmp(score_filename, "") != 0) scorefile = new(allocator) DiskXFile(score_filename, "w");
211 
212 
213 	// create the metric
214 	Metric *metric = NULL;
215 
216 	ConnectedMachine *mlp = NULL;
217 	MyMeanVarNorm *mv_norm = NULL;
218 	Sequence *seq;
219 	
220 	PCAMachine *pca_machine = NULL;
221 
222 	int dim_ = dim;
223 	
224 	if((dim_ == -1) || (dim_ > n_inputs_template)) dim_ = n_inputs_template;
225 
226 	if(canberra) metric = new mCanberra(dim_);
227 	else if(pearson) metric = new mPearson(dim_);
228 	else if(nc) metric = new mNC(dim_);
229 	else if(stdcor) metric = new mStdCorrelation(dim_);
230 	else if(stdcov) metric = new mStdCovariance(dim_);
231 	else if(chisquare) metric = new mChiSquare(dim_);
232 	else if(td)
233 	{
234 	   	if(width != -1 && height != -1 && width * height == n_inputs_template)
235 			metric = new mTangentDistance(width, height);
236 		else error("width(%d) or height (%d) incorrect for Tangent Distance", width, height);
237 	}
238 	else if(mahalanobis)
239 	{
240 		if(strcmp(model_file, ""))
241 		{
242 			pca_machine = new PCAMachine(n_inputs_template);
243 			DiskXFile *file = NULL;
244 			file = new DiskXFile(model_file, "r");
245 			pca_machine->loadXFile(file);
246 			delete file;
247 
248 			pca_machine->setIOption("verbose_level", 1);
249 			pca_machine->setROption("variance", -1.0);
250 			pca_machine->init();
251 			if(dim > 0) pca_machine->n_outputs = dim;
252 
253 			metric = new mMahanalobisCosine(n_inputs_template, pca_machine);
254 		}
255 		else error("No PCA model available for Mahalanobis");
256 	}
257 	else if(mlpmetric1hot)
258 	{
259 		if(strcmp(model_file, ""))
260 		{
261 			int n_inputs_;
262 			int n_hu;
263 			int n_outputs;
264 
265 			print("Loading One hot MLP metric\n");
266 		
267 			DiskXFile mlpmodel(model_file, "r");
268 			mlpmodel.taggedRead(&n_inputs_, sizeof(int), 1, "N_INPUTS");
269 			mlpmodel.taggedRead(&n_hu, sizeof(int), 1, "N_HU");
270 			mlpmodel.taggedRead(&n_outputs, sizeof(int), 1, "N_OUTPUTS");
271 
272 			print(" Number of inputs = %d\n", n_inputs_);
273 			print(" Number of hidden units = %d\n", n_hu);
274 			print(" Number of outputs = %d\n", n_outputs);
275 
276 			if(diff)
277 			{
278 		   		print("Using diff features.\n");
279 
280 				if(n_inputs_ != n_inputs_template) error("Number of inputs incorrect.");
281 			}
282 			else
283 			{
284 		   		if(delta) print("Using delta features.\n");
285 
286 				if(n_inputs_ != 2*n_inputs_template) error("Number of inputs incorrect.");
287 			}
288 			if(n_outputs != 2) error("Number of outputs incorrect.");
289 
290 			//
291 			mlp = new(allocator) ConnectedMachine;
292 			Linear *c1 = new(allocator) Linear(n_inputs_, n_hu);
293 			Tanh *c2 = new(allocator) Tanh(n_hu);
294 			Linear *c3 = new(allocator) Linear(n_hu, n_outputs);
295 			GradientMachine *c4 = new(allocator) LogSoftMax(n_outputs);
296 			mlp->addFCL(c1);    
297 			mlp->addFCL(c2);
298 			mlp->addFCL(c3);
299 			mlp->addFCL(c4);
300 			mlp->build();
301 
302 			//
303     			mv_norm = new(allocator) MyMeanVarNorm(n_inputs_, 1);
304 		
305 			//
306 			mv_norm->loadXFile(&mlpmodel);
307 			mlp->loadXFile(&mlpmodel);
308 
309 			seq = new(allocator) Sequence(1, n_inputs_);
310 		}
311 		else error("No model available");
312 	}
313 	else metric = new mEuclidean(dim_);
314 	
315 	for(int i = 0 ; i < filelist.n_files ; i++)
316 	{
317 	   	if(verbose) print(" + filename[%d] = %s\n", i, filelist.file_names[i]);
318 
319 		char *temp = strBaseName(filelist.file_names[i]);
320 		char *file_name = strRemoveSuffix(temp);
321 
322 		if(scorefile != NULL)
323 		
324 			if(one_score_per_file) scorefile->printf("%s ", file_name);
325 		
326 		int n_inputs;
327 		int n_patterns;
328 			
329 		// Test the file
330 		DiskXFile *file = new DiskXFile(filelist.file_names[i], "r");
331 
332 		file->read(&n_patterns, sizeof(int), 1);
333 		file->read(&n_inputs, sizeof(int), 1);
334 
335 		if(verbose)
336 		{
337 			print("Reading bindata file (%s)\n", filelist.file_names[i]);
338 			print("   n_inputs = %d\n", n_inputs);
339 			print("   n_patterns = %d\n", n_patterns);  
340 		}
341 		
342 		if(n_inputs != n_inputs_template)
343 			error("Incorrect number of inputs (%d <> %d) !", n_inputs, n_inputs_template);
344 			
345 		real min_ = +1000.0;
346 		real max_ = -1000.0;
347 		real sum_ = 0.0;
348 		real avg;
349 
350 		for(int j=0; j< n_patterns; j++)
351 		{
352 			if(!one_score_per_file) scorefile->printf("%s_%03d ", file_name, j);
353 			
354 			file->read(inputs, sizeof(real), n_inputs);
355 		
356 			real d = 0.0;
357 		
358 			if(use_mean_template)
359 			{
360 				if(mlpmetric1hot) d = -mMlpOneHot(n_inputs, inputs, mean_model, mlp, mv_norm, seq, diff, delta);
361 				else d = metric->measure(inputs, mean_model);
362 			}
363 			else
364 			{
365 				for(int p = 0 ; p < n_patterns_model ; p++)
366 				{
367 					//if(strcmp(mlpmetric1hot, "")) d = -mMlpOneHot(n_inputs, inputs, ref_model[p], mlp, mv_norm, seq, diff, delta);
368 					//else d += mEuclidean(n_inputs, inputs, ref_model[p]);
369 					if(mlpmetric1hot) d += -mMlpOneHot(n_inputs, inputs, ref_model[p], mlp, mv_norm, seq, diff, delta);
370 					else d += metric->measure(inputs, ref_model[p]);
371 				}
372 			
373 				d /= (real) n_patterns_model;
374 			}
375 
376 		   	if(!one_score_per_file)
377 			{
378 			   	real z = -d;
379 	
380 				if(strcmp(norm_model_filename, "") != 0)
381 				{
382 					z -= mu;
383 					z /= sigma;
384 				}	
385 
386 	 			scorefile->printf("%g\n",  z);
387 			}
388 			
389 			sum_ += d;
390 	
391 			//
392 			if(d < min_) min_ = d;
393 			if(d > max_) max_ = d;
394 		}
395 
396 		avg = sum_/(real)n_patterns;
397 
398 		//
399 		if(verbose)
400 		{
401 			print("Outputs:\n");
402 			print("  min = %g\n", min_);
403 			print("  max = %g\n", max_);
404 			print("  sum = %g\n", sum_);
405 			print("  avg = %g\n", avg);
406 		}
407 
408 		if(scorefile != NULL)
409 		{
410 		   	if(one_score_per_file)
411 			{
412 			   	real z = -avg;
413 	
414 				if(strcmp(norm_model_filename, "") != 0)
415 				{
416 					z -= mu;
417 					z /= sigma;
418 				}	
419 
420 	 			scorefile->printf("%g ",  z);
421 			}
422 		}
423 		
424 		//
425 		delete file;
426 
427 		//
428 		if(scorefile != NULL)
429 		   	if(one_score_per_file) scorefile->printf("\n");
430 	}
431 	
432 	//
433 	for(int p = 0 ; p < n_patterns_model ; p++) delete [] ref_model[p];
434 	delete [] ref_model;
435 
436 	delete [] inputs;
437 
438 	delete metric;
439 
440 	//
441   	delete allocator;
442 
443   	return(0);
444 }