悲催的科学匠人 - 冷水's blog

kmean算法实现

具体描述见machine learning in action

main函数是一个二维点cluster测试

 

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
struct KMEAN{
int nvec,dim,ngroup;
double *data,     /*  [nvec][dim] */
       *mu,       /*  [ngroup][dim] */
       *scale, *min;    /*  [dim] */
int  *groups,     /* nvec */
     *count;      /* ngroup] */
};


struct KMEAN* KMEAN_init(int nvec, int dim, int ngroup)
{
  struct KMEAN* kmean = (struct KMEAN*) calloc(1,sizeof(struct KMEAN));
  kmean->nvec = nvec;
  kmean->dim  = dim;
  kmean->ngroup = ngroup;
  
  kmean->data  = (double*) calloc(nvec*dim, sizeof(double));
  kmean->mu    = (double*) calloc(ngroup*dim, sizeof(double));
  kmean->scale = (double*) calloc(dim, sizeof(double));
  kmean->min   = (double*) calloc(dim, sizeof(double));
  //kmean->max   = (double*) calloc(dim, sizeof(double));
  kmean->groups = (int*) calloc(nvec, sizeof(int));
  //kmean->mark   = (int*) calloc(nvec, sizeof(int));
  kmean->count  = (int*) calloc(ngroup, sizeof(int));
}



void KMEAN_free(struct KMEAN* kmean)
{
  free(kmean->data);
  free(kmean->mu);
  free(kmean->min);
  //free(kmean->max);
  free(kmean->scale);
  free(kmean->groups);
  //free(kmean->mark);
  free(kmean->count);
}

inline static double dataij(struct KMEAN* kmean, int i, int j)
{
   return kmean->data[i*kmean->dim + j];
}

inline static double muij(struct KMEAN *kmean, int i, int j)
{
  return kmean->mu[i*kmean->dim + j];
}

inline static void setdataij(struct KMEAN* kmean, int i, int j, double v)
{
   kmean->data[i*kmean->dim + j] = v;
}

inline static void setmuij(struct KMEAN *kmean, int i, int j, double v)
{
  kmean->mu[i*kmean->dim + j] = v;
}

inline static void addmuij(struct KMEAN *kmean, int i, int j, double v)
{
  kmean->mu[i*kmean->dim + j] += v;
}



void KMEAN_OutputPlt(struct KMEAN* kmean, const char* fname)
{
   FILE *fp = fopen(fname,"w");
   int n,d;
   if(fp==NULL) {puts("Can not open file for outputing tecplot"); return;}
   
   fprintf(fp,"ZONE T=\"DATA-%s\"\n", fname);
   for(n=0;n<kmean->nvec;n++)
   {
      for(d=0;d<kmean->dim;d++)
        fprintf(fp,"%11.4E ", dataij(kmean,n,d));
      fprintf(fp,"%d\n", kmean->groups[n] );
    }
   fprintf(fp,"ZONE T=\"MU-%s\"\n", fname);
   for(n=0;n<kmean->ngroup;n++)
   {
      for(d=0;d<kmean->dim;d++)
        fprintf(fp,"%11.4E ", muij(kmean,n,d));
      fprintf(fp,"%d\n", n );
    }
   
   fclose(fp);
}

void KMEAN_CalcScale(struct KMEAN *kmean)
{
  int d,n;
  double dmax,dmin;
  
  // compute range of each dim
  for(d=0;d<kmean->dim;d++){
    dmax = dmin = dataij(kmean,0,d);
    for(n=1;n<kmean->nvec;n++){
      double x;
      x = dataij(kmean,n,d);
      if( dmax<x )  dmax = x;
      if( dmin>x )  dmin = x;
    }    
    kmean->scale[d] = dmax - dmin;
    kmean->min[d] = dmin;
    //kmean->max[d] = dmax;
    //printf("Scaling: DIM %4d min=%11.4E max=%11.4E\n",d,dmin,dmax);
  }
  
  // randomly init mu
  for(n=0;n<kmean->ngroup;n++){
    //printf("init mu Group %3d ",n);
    for(d=0;d<kmean->dim;d++){
      setmuij(kmean, n,d,   kmean->min[d] + 0.8*(rand()/(double)(RAND_MAX)) * kmean->scale[d] );
      
      //printf("x_%3d=%11.4E ",d,muij(kmean,n,d));
    }
    //puts("");
  }
  
  for(n=0;n<kmean->nvec;n++) kmean->groups[n] = 0;
  for(d=0;d<kmean->dim;d++)
    kmean->scale[d] = 1.0/ (kmean->scale[d] * kmean->scale[d]);
}

static int WhichGroup(struct KMEAN* kmean, int i)
{
   int g;
   double mindist;
   int mingroup;
   
   
   for(g=0;g<kmean->ngroup;g++){
     double dist=0;
     int d;
     for(d=0;d<kmean->dim;d++){
       double xid,mgd;
       xid =  dataij(kmean,i,d);
       mgd = muij(kmean,g,d);
       dist += (xid-mgd) * (xid-mgd) * kmean->scale[d];
     }
     dist = sqrt(dist);
     if(g==0) {
       mindist =  dist; mingroup = 0;
     }else if(mindist>dist){
       mindist =  dist; mingroup = g;
     }
   }
   return mingroup;
}


static double KMEAN_error(struct KMEAN* kmean)
{
   double err=0.0;
   int n;
   for(n=0;n<kmean->nvec;n++)
   {
     int d,g;
     g = kmean->groups[n];
     for(d=0;d<kmean->dim;d++)
     {
        double dd = dataij(kmean,n,d) - muij(kmean,g,d);
        err += dd*dd*kmean->scale[d];
     }
   }
   return err;
}


static void UpdateMu(struct KMEAN* kmean)
{
   int n,d;
   for(n=0;n<kmean->ngroup;n++) 
   for(d=0;d<kmean->dim;d++)
      setmuij(kmean,n,d,0.0);
      
   for(n=0;n<kmean->nvec;n++)
   for(d=0;d<kmean->dim;d++){
     addmuij(kmean, kmean->groups[n], d, dataij(kmean,n,d) );
   }
   
   for(n=0;n<kmean->ngroup;n++) 
   for(d=0;d<kmean->dim;d++)
      setmuij(kmean,n,d,   muij(kmean,n,d)/kmean->count[n] );
}

static void KMEAN_sweep(struct KMEAN* kmean, double *err, int *changed)
{
   int n;
   
   *changed = 0;
   for(n=0;n<kmean->ngroup;n++)  kmean->count[n] = 0;
   
   for(n=0;n<kmean->nvec;n++){
     int g;
     g = WhichGroup(kmean,n);
     if(kmean->groups[n] != g) (*changed)++;
     kmean->groups[n] = g;
     kmean->count[ kmean->groups[n] ] ++;
     //printf("Vec %3d is Group %3d\n",n,g);
   }
   UpdateMu(kmean);
   *err =  KMEAN_error(kmean);
   
}

double KMEAN_cluster(struct KMEAN *kmean)
{
   int nchanged,it=0;
   double err;
   
   KMEAN_CalcScale(kmean);
   do{
     KMEAN_sweep(kmean, &err, &nchanged);
     //printf("%6d changed %6d  err=%11.4E\n",it++,nchanged, err);
   }while(nchanged>0);
}


void KMEAN_bisect(struct KMEAN *kmean, double *err)
{
   
   int g, worstg;
   double err0,minderr;
   char fname[1024];
   err0 = *err;
   minderr = err0;
   
   
   sprintf(fname,"skmean-g%d.plt",kmean->ngroup);
   KMEAN_OutputPlt(kmean, fname);
   
   // 找到最烂的分组
   for(g=0;g<kmean->ngroup;g++)
   {
      struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
      int i,si=0;
      double cerr;
      // fill group[g] into subset
      for(i=0;i<kmean->nvec;i++){
        if(kmean->groups[i]==g){
          int d;
          for(d=0;d<kmean->dim;d++)  setdataij(subset,si,d,  dataij(kmean,i,d));
          //subset->mark[si] = kmean->mark[i];
          si++;
        }
      }
      
      cerr = KMEAN_cluster(subset);
      KMEAN_free(subset);
      if(minderr > err0-cerr) {minderr = err0-cerr; worstg = g;}
   }
   
   g = worstg;   // 最烂的分组
   {
      struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
      int i,si=0,d;
      double cerr;
      // fill group[g] into subset
      for(i=0;i<kmean->nvec;i++){
        if(kmean->groups[i]==g){
          for(d=0;d<kmean->dim;d++)  setdataij(subset,si,d,  dataij(kmean,i,d));
          //subset->mark[si] = kmean->mark[i];
          si++;
        }
      }
      // 更新mu
      for(d=0;d<kmean->dim;d++) {
         setmuij(kmean, g, d,  muij(subset, 0, d) );
         setmuij(kmean, kmean->ngroup, d,  muij(subset, 1, d) );
      }
      kmean->ngroup++;
      KMEAN_free(subset);
      
      // 重新对kmean做一次分组
      *err = KMEAN_cluster(kmean);
   }
   
   return;
}



void KMEAN_BISECT(struct KMEAN* kmean)
{
    int  it=0,dest_ngroup = kmean->ngroup;
    double err;
    
    if(dest_ngroup<2) return;
    
    kmean->ngroup = 2;
    err = KMEAN_cluster(kmean);
    
    do{
       KMEAN_bisect(kmean, &err);
       printf("Step %6d err=%11.4E\n",++it, err);
    }while(kmean->ngroup<dest_ngroup);    
}








int main(void)
{
    struct KMEAN *kmean = KMEAN_init(80, 2, 4);
    double err;
    
    {
      FILE* fp = fopen("testSet.txt","r");
      int n=0;
      for(n=0;n<80;n++){
         double x,y;
         fscanf(fp,"%lf%lf", &x,&y);
         setdataij(kmean,n,0, x);
         setdataij(kmean,n,1, y);
      }
      fclose(fp);
    }
    /*  下面这一行是采用普通kmean */
    //err = KMEAN_cluster(kmean);  printf("err=%11.4E\n", err);
    
    /*  下面这一行是采用二分kmean */
    KMEAN_BISECT(kmean);
    
    KMEAN_OutputPlt(kmean, "skmean-g4.plt");
    
    KMEAN_free(kmean);

}

KNN算法的实现

具体描述见machine learning in action

knn.h

/* knn.c */
struct KNN *KNNInit(int nvectors, int dim, int ngroup, int *ierr);
void KNNFree(struct KNN *knn, int *ierr);
double *KNNGetdata(struct KNN *knn, int ivec, int *ierr);
double KNNGetCoord(struct KNN *knn, int ivec, int icoord, int *ierr);
void KNNSetGroup(struct KNN *knn, int ivec, int group, int *ierr);
void KNNSetDataGroup(struct KNN *knn, int ivec, double *data, int group, int *ierr);
void KNNCalcScale(struct KNN *knn, int *ierr);
double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr);
int KNNCalcGroup(struct KNN *knn, double *mydata, int k, int *ierr);

knn.c

#include <stdlib.h>
#include <math.h>
#include <stdlib.h>

struct KNN{
  int nvec;
  int dim;
  double *data;
  double *scale;
  int ngroups;
  int *group;   /* group ID (0,1,..., ngroups-1) for each vec*/
};



inline int KNNGetDataOffset(struct KNN *knn, int ivec)
{
  return ivec*knn->dim;
}

inline int KNNCheckIvec(struct KNN* knn, int ivec)
{
    if( ivec<0 || 1+ivec>knn->nvec) 
      return 1;
    else
      return 0;
}


struct KNN*  KNNInit(int nvectors, int dim, int ngroup, int *ierr)
{
  double  *data, *scale;
  int *group,  i;

  if(nvectors<1) {*ierr = 1; puts("KNNInit, error, nvectors<1");  return NULL; }
  if(dim<1) {*ierr= 2; puts("KNNInit, error, dim<1"); return NULL; }
  if(ngroup<2) {*ierr=4; puts("KNNInit, error, ngroup<2"); return;  } 

  data = (double*) calloc(nvectors*dim, sizeof(double));
  if (data==NULL) {*ierr = 5; puts("KNNInit, error, calloc knn->data fail"); return; }

  group = (int*) calloc(nvectors, sizeof(int));
  if(group==NULL) {*ierr=6; puts("KNNInit, error, calloc knn->group fail"); return;}
  for(i=0;i<nvectors;i++) group[i] = -1;

  scale = (double*) calloc(dim, sizeof(double));
  if(scale==NULL) {*ierr=7; puts("KNNInit, error, calloc knn->scale fail"); return;}
  
  
  struct KNN* knn = (struct KNN*)calloc(1,sizeof(struct KNN));
  if(knn==NULL) {*ierr=999; puts("KNNInit, error, calloc KNN fail"); return NULL;}

  knn->data = data;
  knn->group = group;
  knn->scale = scale;

  knn->nvec= nvectors;
  knn->dim = dim;
  knn->ngroups = ngroup;

  ierr = 0;
  return knn;
}

void KNNFree(struct KNN* knn, int* ierr)
{
  if(knn==NULL) return;
  if(knn->data) free(knn->data);
  if(knn->group) free(knn->group);
  free(knn);
}



double* KNNGetdata(struct KNN *knn, int ivec, int *ierr)
{
   *ierr = KNNCheckIvec(knn,ivec);
   if(*ierr) {puts("KNNGetData, irror, Wrong ivec");  return;}

   return knn->data + KNNGetDataOffset(knn, ivec);
}


double KNNGetCoord(struct KNN* knn, int ivec, int icoord, int* ierr)
{
  double *data;
  data = KNNGetdata(knn, ivec, ierr);
  if(*ierr) return 0;
  if(icoord<0 || icoord>=knn->dim) {
     puts("KNNGetCoord, error, wrong icoord"); *ierr= 2; return 0;
  }
  *ierr = 0;
  return data[icoord];
}



void KNNSetGroup(struct KNN* knn, int ivec, int group, int* ierr)
{
  *ierr =  KNNCheckIvec(knn,ivec);  if(*ierr) {puts("KNNSetGroup, error, Wrong ivec");  return;}
  *ierr = group<0 || group>=knn->ngroups;   if(*ierr) {puts("KNNSetGroup, error, Wrong group ID (should be [0,..,ngroups-1])");  return;}

  knn->group[ivec] = group;
  *ierr = 0;
  return; 
}


void KNNSetDataGroup(struct KNN *knn,int ivec, double *data, int group, int* ierr)
{
   int i;
   double *knndata = KNNGetdata(knn, ivec, ierr);
   if(*ierr) return;

   for(i=0; i<knn->dim; i++)  knndata[i] = data[i];

   KNNSetGroup(knn, ivec, group, ierr);
   if(*ierr) return;

   *ierr = 0;
   return;
}

static int KNNIsDataFilled(struct KNN* knn)
{
   int i;
   for(i=0; i<knn->nvec; i++) if(knn->group[i]==-1)  return 0;
   return 1;
}

void KNNCalcScale(struct KNN* knn, int *ierr)
{
   int i,d;
   if(!KNNIsDataFilled(knn)) {  puts("KNNCalcScale, error, data is not filled");  return; }

   for(d=0;d<knn->dim;d++){
     double xmax,xmin;
     xmin = KNNGetCoord(knn, 0, d, ierr);
     xmax = xmin;
     for(i=1; i<knn->nvec; i++){
       double f;
       f = KNNGetCoord(knn, i, d, ierr);
       if(xmax<f) xmax = f;
       if(xmin>f) xmin = f;
     }
     if(xmax!=xmin) 
        knn->scale[d] = 1./((xmax-xmin)*(xmax-xmin));
     else
        knn->scale[d] = 1.0;
   }
}


double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr)
{
  double *data=NULL, dist=0.0;
  int i;

  data = KNNGetdata(knn, ivec, ierr);  if(*ierr) {puts("KNNCalcDistance, error, KNNGetData fail"); return;}
  
  for(i=0;i<knn->dim;i++){
    dist += (mydata[i]-data[i]) * (mydata[i]-data[i]) * knn->scale[i];
  }
  *ierr = 0;
  return sqrt(dist);
}


static void make_list(double *dlist, int* idlist, int n,  int cid, double cdist)
{
   double dmax;
   int imax, i;
   dmax = dlist[0]; imax=0;

   for(i=1;i<n;i++){
     if(dlist[i]>dmax) {dmax=dlist[i]; imax=i;}
   }
   if(cdist>dmax) 
     return;
   else{
     dlist[imax] = cdist;
     idlist[imax]= cid;
   }
}


static int vote(struct KNN* knn, int *groups, int k)
{
   int *votes;
   int i, max=0;
   votes = (int*) calloc(knn->ngroups, sizeof(int));
   for(i=0;i<knn->ngroups;i++)  votes[i]=0;

   for(i=0; i<k; i++)
     votes[ groups[i] ] ++;
     
   for(i=1; i<knn->ngroups; i++) {
     if(votes[i]>votes[max]) max=i;
   }

   return max;
}


int KNNCalcGroup(struct KNN* knn, double *mydata, int k, int* ierr)
{
  double* dists;
  int*    vecid;
  int n;
  double dmax,dmin;
  int    imax,imin;
  dists = (double*) calloc(k, sizeof(double));
  if(dists==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for dists"); return;}
  
  vecid = (int*) calloc(k, sizeof(int));
  if(vecid==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for vecid"); return;}
  for(n=0;n<k;n++){
    dists[n] = -1.;
    vecid[n] = -1;
  }
  for(n=0;n<knn->nvec;n++){
    double cdist;
    cdist = KNNCalcDistance(knn, n, mydata, ierr);
    
    if(n==0){
      int l;
      for(l=0;l<k;l++) {
         dists[l] = cdist; dmax=cdist;dmin=cdist;
         vecid[l] = n;     imax=n;    imin=n;
      }
    }
    else{ 
      make_list(dists, vecid, k,  n, cdist);
    }
  }
  free(dists);
  
  for(n=0;n<k;n++) vecid[n] = knn->group[vecid[n]];
  n = vote(knn, vecid,k);
  free(vecid);
  return n;
}

测试手写数字识别的程序 digits.c

#include <stdio.h>

#include "knn.h"

#define NVEC 1934
#define DIM  1024
#define NGROUP 10
#define K 3

void InputData(struct KNN *knn)
{
  FILE* fp;
  int id=0, i, ndata[10]={188, 197, 194, 198, 185, 186, 194, 200, 179, 203};
  double mydata[DIM];
  int g, n, ierr;
  char fname[32];
  
  for(g=0;g<10;g++)
  for(n=0;n<=ndata[g];n++){
  
    double mydata[1024];
    sprintf(fname,"trainingDigits/%d_%d.txt", g,n);
    //puts(fname);
    fp = fopen(fname,"r"); 
    for(i=0;i<1024;i++){
      int d;
      fscanf(fp,"%1d",&d);
      mydata[i] = d;
    }
    
    KNNSetDataGroup(knn, id, mydata, g, &ierr);
    if(ierr!=0) puts("ERROR!!");
    id ++;
    fclose(fp);
  }
  printf("Read %d training data\n",id);
  
}


void Test(struct KNN *knn)
{

  FILE* fp;
  int id=0, i, ndata[10]={86, 96, 91, 84, 99, 99, 86, 95, 90, 88};
  double mydata[DIM];
  int g, n, ierr,score;
  char fname[32];
  
  for(g=0;g<10;g++)
  {
    score = 0;
    for(n=0;n<=ndata[g];n++){
  
      double mydata[1024];
      int ans;
      
      sprintf(fname,"testDigits/%d_%d.txt", g,n);
      fp = fopen(fname,"r");       
      for(i=0;i<1024;i++){
        int d;
        fscanf(fp,"%1d",&d);
        mydata[i] = d;
      }          
      fclose(fp);
    
      ans = KNNCalcGroup(knn, mydata, K, &ierr);
      if(ans==g) score ++;
      else{
         printf("                                 fail for %s\n",fname);
      }
    }
    
    printf("Digit %1d  score=%3d/%3d = %6.2f%%\n",
       g, score, ndata[g]+1,  100.*score/(ndata[g]+1.));
  }
   
}



int main(void)
{
   int ierr;
   struct KNN* knn;
   knn = KNNInit(NVEC,DIM,NGROUP, &ierr);
   if(ierr) {puts("can not KNNInit"); return; }

   InputData(knn);
   KNNCalcScale(knn, &ierr);

   Test(knn);
   KNNFree(knn, &ierr);
}

 




Host by is-Programmer.com | Power by Chito 1.3.3 beta | © 2007 LinuxGem | Design by Matthew "Agent Spork" McGee