]> git.jsancho.org Git - lugaru.git/blob - libvorbis-1.0.1/vq/latticepare.c
Simplified web browser launching on Linux.
[lugaru.git] / libvorbis-1.0.1 / vq / latticepare.c
1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001             *
9  * by the XIPHOPHORUS Company http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************
12
13  function: utility for paring low hit count cells from lattice codebook
14  last mod: $Id: latticepare.c,v 1.11 2001/12/20 01:00:39 segher Exp $
15
16  ********************************************************************/
17
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <math.h>
21 #include <string.h>
22 #include <errno.h>
23 #include "../lib/scales.h"
24 #include "bookutil.h"
25 #include "vqgen.h"
26 #include "vqsplit.h"
27 #include "../lib/os.h"
28
29 /* Lattice codebooks have two strengths: important fetaures that are
30    poorly modelled by global error minimization training (eg, strong
31    peaks) are not neglected 2) compact quantized representation.
32
33    A fully populated lattice codebook, however, swings point 1 too far
34    in the opposite direction; rare features need not be modelled quite
35    so religiously and as such, we waste bits unless we eliminate the
36    least common cells.  The codebook rep supports unused cells, so we
37    need to tag such cells and build an auxiliary (non-thresh) search
38    mechanism to find the proper match quickly */
39
40 /* two basic steps; first is pare the cell for which dispersal creates
41    the least additional error.  This will naturally choose
42    low-population cells and cells that have not taken on points from
43    neighboring paring (but does not result in the lattice collapsing
44    inward and leaving low population ares totally unmodelled).  After
45    paring has removed the desired number of cells, we need to build an
46    auxiliary search for each culled point */
47
48 /* Although lattice books (due to threshhold-based matching) do not
49    actually use error to make cell selections (in fact, it need not
50    bear any relation), the 'secondbest' entry finder here is in fact
51    error/distance based, so latticepare is only useful on such books */
52
53 /* command line:
54    latticepare latticebook.vqh input_data.vqd <target_cells>
55
56    produces a new output book on stdout 
57 */
58
59 static float _dist(int el,float *a, float *b){
60   int i;
61   float acc=0.f;
62   for(i=0;i<el;i++){
63     float val=(a[i]-b[i]);
64     acc+=val*val;
65   }
66   return(acc);
67 }
68
69 static float *pointlist;
70 static long points=0;
71
72 void add_vector(codebook *b,float *vec,long n){
73   int dim=b->dim,i,j;
74   int step=n/dim;
75   for(i=0;i<step;i++){
76     for(j=i;j<n;j+=step){
77       pointlist[points++]=vec[j];
78     }
79   }
80 }
81
82 static int bestm(codebook *b,float *vec){
83   encode_aux_threshmatch *tt=b->c->thresh_tree;
84   int dim=b->dim;
85   int i,k,o;
86   int best=0;
87   
88   /* what would be the closest match if the codebook was fully
89      populated? */
90   
91   for(k=0,o=dim-1;k<dim;k++,o--){
92     int i;
93     for(i=0;i<tt->threshvals-1;i++)
94       if(vec[o]<tt->quantthresh[i])break;
95     best=(best*tt->quantvals)+tt->quantmap[i];
96   }
97   return(best);
98 }
99
100 static int closest(codebook *b,float *vec,int current){
101   encode_aux_threshmatch *tt=b->c->thresh_tree;
102   int dim=b->dim;
103   int i,k,o;
104
105   float bestmetric=0;
106   int bestentry=-1;
107   int best=bestm(b,vec);
108
109   if(current<0 && b->c->lengthlist[best]>0)return best;
110
111   for(i=0;i<b->entries;i++){
112     if(b->c->lengthlist[i]>0 && i!=best && i!=current){
113       float thismetric=_dist(dim, vec, b->valuelist+i*dim);
114       if(bestentry==-1 || thismetric<bestmetric){
115         bestentry=i;
116         bestmetric=thismetric;
117       }
118     }
119   }
120
121   return(bestentry);
122 }
123
124 static float _heuristic(codebook *b,float *ppt,int secondbest){
125   float *secondcell=b->valuelist+secondbest*b->dim;
126   int best=bestm(b,ppt);
127   float *firstcell=b->valuelist+best*b->dim;
128   float error=_dist(b->dim,firstcell,secondcell);
129   float *zero=alloca(b->dim*sizeof(float));
130   float fromzero;
131   
132   memset(zero,0,b->dim*sizeof(float));
133   fromzero=sqrt(_dist(b->dim,firstcell,zero));
134
135   return(error/fromzero);
136 }
137
138 static int longsort(const void *a, const void *b){
139   return **(long **)b-**(long **)a;
140 }
141
142 void usage(void){
143   fprintf(stderr,"Ogg/Vorbis lattice codebook paring utility\n\n"
144           "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
145           "where <target_cells> is the desired number of final cells (or -1\n"
146           "                     for no change)\n"
147           "      <protected_cells> is the number of highest-hit count cells\n"
148           "                     to protect from dispersal\n"
149           "      base is the base name (not including .vqh) of the new\n"
150           "                     book\n\n");
151   exit(1);
152 }
153
154 int main(int argc,char *argv[]){
155   char *basename;
156   codebook *b=NULL;
157   int entries=0;
158   int dim=0;
159   long i,j,target=-1,protect=-1;
160   FILE *out=NULL;
161
162   int argnum=0;
163
164   argv++;
165   if(*argv==NULL){
166     usage();
167     exit(1);
168   }
169
170   while(*argv){
171     if(*argv[0]=='-'){
172
173       argv++;
174         
175     }else{
176       switch (argnum++){
177       case 0:case 1:
178         {
179           /* yes, this is evil.  However, it's very convenient to parse file
180              extentions */
181           
182           /* input file.  What kind? */
183           char *dot;
184           char *ext=NULL;
185           char *name=strdup(*argv++);
186           dot=strrchr(name,'.');
187           if(dot)
188             ext=dot+1;
189           else{
190             ext="";
191             
192           }
193           
194           
195           /* codebook */
196           if(!strcmp(ext,"vqh")){
197             
198             basename=strrchr(name,'/');
199             if(basename)
200               basename=strdup(basename)+1;
201             else
202               basename=strdup(name);
203             dot=strrchr(basename,'.');
204             if(dot)*dot='\0';
205             
206             b=codebook_load(name);
207             dim=b->dim;
208             entries=b->entries;
209           }
210           
211           /* data file; we do actually need to suck it into memory */
212           /* we're dealing with just one book, so we can de-interleave */ 
213           if(!strcmp(ext,"vqd") && !points){
214             int cols;
215             long lines=0;
216             char *line;
217             float *vec;
218             FILE *in=fopen(name,"r");
219             if(!in){
220               fprintf(stderr,"Could not open input file %s\n",name);
221               exit(1);
222             }
223             
224             reset_next_value();
225             line=setup_line(in);
226             /* count cols before we start reading */
227             {
228               char *temp=line;
229               while(*temp==' ')temp++;
230               for(cols=0;*temp;cols++){
231                 while(*temp>32)temp++;
232                 while(*temp==' ')temp++;
233               }
234             }
235             vec=alloca(cols*sizeof(float));
236             /* count, then load, to avoid fragmenting the hell out of
237                memory */
238             while(line){
239               lines++;
240               for(j=0;j<cols;j++)
241                 if(get_line_value(in,vec+j)){
242                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
243                   exit(1);
244                 }
245               if((lines&0xff)==0)spinnit("counting samples...",lines*cols);
246               line=setup_line(in);
247             }
248             pointlist=_ogg_malloc((cols*lines+entries*dim)*sizeof(float));
249             
250             rewind(in);
251             line=setup_line(in);
252             while(line){
253               lines--;
254               for(j=0;j<cols;j++)
255                 if(get_line_value(in,vec+j)){
256                   fprintf(stderr,"Too few columns on line %ld in data file\n",lines);
257                   exit(1);
258                 }
259               /* deinterleave, add to heap */
260               add_vector(b,vec,cols);
261               if((lines&0xff)==0)spinnit("loading samples...",lines*cols);
262               
263               line=setup_line(in);
264             }
265             fclose(in);
266           }
267         }
268         break;
269       case 2:
270         target=atol(*argv++);
271         if(target==0)target=entries;
272         break;
273       case 3:
274         protect=atol(*argv++);
275         break;
276       case 4:
277         {
278           char *buff=alloca(strlen(*argv)+5);
279           sprintf(buff,"%s.vqh",*argv);
280           basename=*argv++;
281
282           out=fopen(buff,"w");
283           if(!out){
284             fprintf(stderr,"unable ot open %s for output",buff);
285             exit(1);
286           }
287         }
288         break;
289       default:
290         usage();
291       }
292     }
293   }
294   if(!entries || !points || !out)usage();
295   if(target==-1)usage();
296
297   /* add guard points */
298   for(i=0;i<entries;i++)
299     for(j=0;j<dim;j++)
300       pointlist[points++]=b->valuelist[i*dim+j];
301   
302   points/=dim;
303
304   /* set up auxiliary vectors for error tracking */
305   {
306     encode_aux_nearestmatch *nt=NULL;
307     long pointssofar=0;
308     long *pointindex;
309     long indexedpoints=0;
310     long *entryindex;
311     long *reventry;
312     long *membership=_ogg_malloc(points*sizeof(long));
313     long *firsthead=_ogg_malloc(entries*sizeof(long));
314     long *secondary=_ogg_malloc(points*sizeof(long));
315     long *secondhead=_ogg_malloc(entries*sizeof(long));
316
317     long *cellcount=_ogg_calloc(entries,sizeof(long));
318     long *cellcount2=_ogg_calloc(entries,sizeof(long));
319     float *cellerror=_ogg_calloc(entries,sizeof(float));
320     float *cellerrormax=_ogg_calloc(entries,sizeof(float));
321     long cellsleft=entries;
322     for(i=0;i<points;i++)membership[i]=-1;
323     for(i=0;i<entries;i++)firsthead[i]=-1;
324     for(i=0;i<points;i++)secondary[i]=-1;
325     for(i=0;i<entries;i++)secondhead[i]=-1;
326
327     for(i=0;i<points;i++){
328       /* assign vectors to the nearest cell.  Also keep track of second
329          nearest for error statistics */
330       float *ppt=pointlist+i*dim;
331       int    firstentry=closest(b,ppt,-1);
332       int    secondentry=closest(b,ppt,firstentry);
333       float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
334       float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
335       
336       if(!(i&0xff))spinnit("initializing... ",points-i);
337     
338       membership[i]=firsthead[firstentry];
339       firsthead[firstentry]=i;
340       secondary[i]=secondhead[secondentry];
341       secondhead[secondentry]=i;
342
343       if(i<points-entries){
344         cellerror[firstentry]+=secondmetric-firstmetric;
345         cellerrormax[firstentry]=max(cellerrormax[firstentry],
346                                      _heuristic(b,ppt,secondentry));
347         cellcount[firstentry]++;
348         cellcount2[secondentry]++;
349       }
350     }
351
352     /* which cells are most heavily populated?  Protect as many from
353        dispersal as the user has requested */
354     {
355       long **countindex=_ogg_calloc(entries,sizeof(long *));
356       for(i=0;i<entries;i++)countindex[i]=cellcount+i;
357       qsort(countindex,entries,sizeof(long *),longsort);
358       for(i=0;i<protect;i++){
359         int ptr=countindex[i]-cellcount;
360         cellerrormax[ptr]=9e50f;
361       }
362     }
363
364     {
365       fprintf(stderr,"\r");
366       for(i=0;i<entries;i++){
367         /* decompose index */
368         int entry=i;
369         for(j=0;j<dim;j++){
370           fprintf(stderr,"%d:",entry%b->c->thresh_tree->quantvals);
371           entry/=b->c->thresh_tree->quantvals;
372         }
373         
374         fprintf(stderr,":%ld/%ld, ",cellcount[i],cellcount2[i]);
375       }
376       fprintf(stderr,"\n");
377     }
378
379     /* do the automatic cull request */
380     while(cellsleft>target){
381       int bestcell=-1;
382       float besterror=0;
383       float besterror2=0;
384       long head=-1;
385       char spinbuf[80];
386       sprintf(spinbuf,"cells left to eliminate: %ld : ",cellsleft-target);
387
388       /* find the cell with lowest removal impact */
389       for(i=0;i<entries;i++){
390         if(b->c->lengthlist[i]>0){
391           if(bestcell==-1 || cellerrormax[i]<=besterror2){
392             if(bestcell==-1 || cellerrormax[i]<besterror2 || 
393                besterror>cellerror[i]){
394               besterror=cellerror[i];
395               besterror2=cellerrormax[i];
396               bestcell=i;
397             }
398           }
399         }
400       }
401
402       fprintf(stderr,"\reliminating cell %d                              \n"
403               "     dispersal error of %g max/%g total (%ld hits)\n",
404               bestcell,besterror2,besterror,cellcount[bestcell]);
405
406       /* disperse it.  move each point out, adding it (properly) to
407          the second best */
408       b->c->lengthlist[bestcell]=0;
409       head=firsthead[bestcell];
410       firsthead[bestcell]=-1;
411       while(head!=-1){
412         /* head is a point number */
413         float *ppt=pointlist+head*dim;
414         int firstentry=closest(b,ppt,-1);
415         int secondentry=closest(b,ppt,firstentry);
416         float firstmetric=_dist(dim,b->valuelist+dim*firstentry,ppt);
417         float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
418         long next=membership[head];
419
420         if(head<points-entries){
421           cellcount[firstentry]++;
422           cellcount[bestcell]--;
423           cellerror[firstentry]+=secondmetric-firstmetric;
424           cellerrormax[firstentry]=max(cellerrormax[firstentry],
425                                        _heuristic(b,ppt,secondentry));
426         }
427
428         membership[head]=firsthead[firstentry];
429         firsthead[firstentry]=head;
430         head=next;
431         if(cellcount[bestcell]%128==0)
432           spinnit(spinbuf,cellcount[bestcell]+cellcount2[bestcell]);
433
434       }
435
436       /* now see that all points that had the dispersed cell as second
437          choice have second choice reassigned */
438       head=secondhead[bestcell];
439       secondhead[bestcell]=-1;
440       while(head!=-1){
441         float *ppt=pointlist+head*dim;
442         /* who are we assigned to now? */
443         int firstentry=closest(b,ppt,-1);
444         /* what is the new second closest match? */
445         int secondentry=closest(b,ppt,firstentry);
446         /* old second closest is the cell being disbanded */
447         float oldsecondmetric=_dist(dim,b->valuelist+dim*bestcell,ppt);
448         /* new second closest error */
449         float secondmetric=_dist(dim,b->valuelist+dim*secondentry,ppt);
450         long next=secondary[head];
451
452         if(head<points-entries){
453           cellcount2[secondentry]++;
454           cellcount2[bestcell]--;
455           cellerror[firstentry]+=secondmetric-oldsecondmetric;
456           cellerrormax[firstentry]=max(cellerrormax[firstentry],
457                                        _heuristic(b,ppt,secondentry));
458         }
459         
460         secondary[head]=secondhead[secondentry];
461         secondhead[secondentry]=head;
462         head=next;
463
464         if(cellcount2[bestcell]%128==0)
465           spinnit(spinbuf,cellcount2[bestcell]);
466       }
467
468       cellsleft--;
469     }
470
471     /* paring is over.  Build decision trees using points that now fall
472        through the thresh matcher. */
473     /* we don't free membership; we flatten it in order to use in lp_split */
474
475     for(i=0;i<entries;i++){
476       long head=firsthead[i];
477       spinnit("rearranging membership cache... ",entries-i);
478       while(head!=-1){
479         long next=membership[head];
480         membership[head]=i;
481         head=next;
482       }
483     }
484
485     free(secondhead);
486     free(firsthead);
487     free(cellerror);
488     free(cellerrormax);
489     free(secondary);
490
491     pointindex=_ogg_malloc(points*sizeof(long));
492     /* make a point index of fall-through points */
493     for(i=0;i<points;i++){
494       int best=_best(b,pointlist+i*dim,1);
495       if(best==-1)
496         pointindex[indexedpoints++]=i;
497       spinnit("finding orphaned points... ",points-i);
498     }
499
500     /* make an entry index */
501     entryindex=_ogg_malloc(entries*sizeof(long));
502     target=0;
503     for(i=0;i<entries;i++){
504       if(b->c->lengthlist[i]>0)
505         entryindex[target++]=i;
506     }
507
508     /* make working space for a reverse entry index */
509     reventry=_ogg_malloc(entries*sizeof(long));
510
511     /* do the split */
512     nt=b->c->nearest_tree=
513       _ogg_calloc(1,sizeof(encode_aux_nearestmatch));
514
515     nt->alloc=4096;
516     nt->ptr0=_ogg_malloc(sizeof(long)*nt->alloc);
517     nt->ptr1=_ogg_malloc(sizeof(long)*nt->alloc);
518     nt->p=_ogg_malloc(sizeof(long)*nt->alloc);
519     nt->q=_ogg_malloc(sizeof(long)*nt->alloc);
520     nt->aux=0;
521
522     fprintf(stderr,"Leaves added: %d              \n",
523             lp_split(pointlist,points,
524                      b,entryindex,target,
525                      pointindex,indexedpoints,
526                      membership,reventry,
527                      0,&pointssofar));
528     free(membership);
529     free(reventry);
530     free(pointindex);
531
532     /* hack alert.  I should just change the damned splitter and
533        codebook writer */
534     for(i=0;i<nt->aux;i++)nt->p[i]*=dim;
535     for(i=0;i<nt->aux;i++)nt->q[i]*=dim;
536     
537     /* recount hits.  Build new lengthlist. reuse entryindex storage */
538     for(i=0;i<entries;i++)entryindex[i]=1;
539     for(i=0;i<points-entries;i++){
540       int best=_best(b,pointlist+i*dim,1);
541       float *a=pointlist+i*dim;
542       if(!(i&0xff))spinnit("counting hits...",i);
543       if(best==-1){
544         fprintf(stderr,"\nINTERNAL ERROR; a point count not be matched to a\n"
545                 "codebook entry.  The new decision tree is broken.\n");
546         exit(1);
547       }
548       entryindex[best]++;
549     }
550     for(i=0;i<nt->aux;i++)nt->p[i]/=dim;
551     for(i=0;i<nt->aux;i++)nt->q[i]/=dim;
552     
553     /* the lengthlist builder doesn't actually deal with 0 hit entries.
554        So, we pack the 'sparse' hit list into a dense list, then unpack
555        the lengths after the build */
556     {
557       int upper=0;
558       long *lengthlist=_ogg_calloc(entries,sizeof(long));
559       for(i=0;i<entries;i++){
560         if(b->c->lengthlist[i]>0)
561           entryindex[upper++]=entryindex[i];
562         else{
563           if(entryindex[i]>1){
564             fprintf(stderr,"\nINTERNAL ERROR; _best matched to unused entry\n");
565             exit(1);
566           }
567         }
568       }
569       
570       /* sanity check */
571       if(upper != target){
572         fprintf(stderr,"\nINTERNAL ERROR; packed the wrong number of entries\n");
573         exit(1);
574       }
575     
576       build_tree_from_lengths(upper,entryindex,lengthlist);
577       
578       upper=0;
579       for(i=0;i<entries;i++){
580         if(b->c->lengthlist[i]>0)
581           b->c->lengthlist[i]=lengthlist[upper++];
582       }
583
584     }
585   }
586   /* we're done.  write it out. */
587   write_codebook(out,basename,b->c);
588
589   fprintf(stderr,"\r                                        \nDone.\n");
590   return(0);
591 }
592
593
594
595