kmean算法实现 - 悲催的科学匠人 - 冷水's blog

kmean算法实现

冷水 posted @ 2014年7月06日 19:25 in C++ with tags 机器学习 kmean , 1677 阅读

具体描述见machine learning in action

main函数是一个二维点cluster测试

 

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
struct KMEAN{
int nvec,dim,ngroup;
double *data,     /*  [nvec][dim] */
       *mu,       /*  [ngroup][dim] */
       *scale, *min;    /*  [dim] */
int  *groups,     /* nvec */
     *count;      /* ngroup] */
};


struct KMEAN* KMEAN_init(int nvec, int dim, int ngroup)
{
  struct KMEAN* kmean = (struct KMEAN*) calloc(1,sizeof(struct KMEAN));
  kmean->nvec = nvec;
  kmean->dim  = dim;
  kmean->ngroup = ngroup;
  
  kmean->data  = (double*) calloc(nvec*dim, sizeof(double));
  kmean->mu    = (double*) calloc(ngroup*dim, sizeof(double));
  kmean->scale = (double*) calloc(dim, sizeof(double));
  kmean->min   = (double*) calloc(dim, sizeof(double));
  //kmean->max   = (double*) calloc(dim, sizeof(double));
  kmean->groups = (int*) calloc(nvec, sizeof(int));
  //kmean->mark   = (int*) calloc(nvec, sizeof(int));
  kmean->count  = (int*) calloc(ngroup, sizeof(int));
}



void KMEAN_free(struct KMEAN* kmean)
{
  free(kmean->data);
  free(kmean->mu);
  free(kmean->min);
  //free(kmean->max);
  free(kmean->scale);
  free(kmean->groups);
  //free(kmean->mark);
  free(kmean->count);
}

inline static double dataij(struct KMEAN* kmean, int i, int j)
{
   return kmean->data[i*kmean->dim + j];
}

inline static double muij(struct KMEAN *kmean, int i, int j)
{
  return kmean->mu[i*kmean->dim + j];
}

inline static void setdataij(struct KMEAN* kmean, int i, int j, double v)
{
   kmean->data[i*kmean->dim + j] = v;
}

inline static void setmuij(struct KMEAN *kmean, int i, int j, double v)
{
  kmean->mu[i*kmean->dim + j] = v;
}

inline static void addmuij(struct KMEAN *kmean, int i, int j, double v)
{
  kmean->mu[i*kmean->dim + j] += v;
}



void KMEAN_OutputPlt(struct KMEAN* kmean, const char* fname)
{
   FILE *fp = fopen(fname,"w");
   int n,d;
   if(fp==NULL) {puts("Can not open file for outputing tecplot"); return;}
   
   fprintf(fp,"ZONE T=\"DATA-%s\"\n", fname);
   for(n=0;n<kmean->nvec;n++)
   {
      for(d=0;d<kmean->dim;d++)
        fprintf(fp,"%11.4E ", dataij(kmean,n,d));
      fprintf(fp,"%d\n", kmean->groups[n] );
    }
   fprintf(fp,"ZONE T=\"MU-%s\"\n", fname);
   for(n=0;n<kmean->ngroup;n++)
   {
      for(d=0;d<kmean->dim;d++)
        fprintf(fp,"%11.4E ", muij(kmean,n,d));
      fprintf(fp,"%d\n", n );
    }
   
   fclose(fp);
}

void KMEAN_CalcScale(struct KMEAN *kmean)
{
  int d,n;
  double dmax,dmin;
  
  // compute range of each dim
  for(d=0;d<kmean->dim;d++){
    dmax = dmin = dataij(kmean,0,d);
    for(n=1;n<kmean->nvec;n++){
      double x;
      x = dataij(kmean,n,d);
      if( dmax<x )  dmax = x;
      if( dmin>x )  dmin = x;
    }    
    kmean->scale[d] = dmax - dmin;
    kmean->min[d] = dmin;
    //kmean->max[d] = dmax;
    //printf("Scaling: DIM %4d min=%11.4E max=%11.4E\n",d,dmin,dmax);
  }
  
  // randomly init mu
  for(n=0;n<kmean->ngroup;n++){
    //printf("init mu Group %3d ",n);
    for(d=0;d<kmean->dim;d++){
      setmuij(kmean, n,d,   kmean->min[d] + 0.8*(rand()/(double)(RAND_MAX)) * kmean->scale[d] );
      
      //printf("x_%3d=%11.4E ",d,muij(kmean,n,d));
    }
    //puts("");
  }
  
  for(n=0;n<kmean->nvec;n++) kmean->groups[n] = 0;
  for(d=0;d<kmean->dim;d++)
    kmean->scale[d] = 1.0/ (kmean->scale[d] * kmean->scale[d]);
}

static int WhichGroup(struct KMEAN* kmean, int i)
{
   int g;
   double mindist;
   int mingroup;
   
   
   for(g=0;g<kmean->ngroup;g++){
     double dist=0;
     int d;
     for(d=0;d<kmean->dim;d++){
       double xid,mgd;
       xid =  dataij(kmean,i,d);
       mgd = muij(kmean,g,d);
       dist += (xid-mgd) * (xid-mgd) * kmean->scale[d];
     }
     dist = sqrt(dist);
     if(g==0) {
       mindist =  dist; mingroup = 0;
     }else if(mindist>dist){
       mindist =  dist; mingroup = g;
     }
   }
   return mingroup;
}


static double KMEAN_error(struct KMEAN* kmean)
{
   double err=0.0;
   int n;
   for(n=0;n<kmean->nvec;n++)
   {
     int d,g;
     g = kmean->groups[n];
     for(d=0;d<kmean->dim;d++)
     {
        double dd = dataij(kmean,n,d) - muij(kmean,g,d);
        err += dd*dd*kmean->scale[d];
     }
   }
   return err;
}


static void UpdateMu(struct KMEAN* kmean)
{
   int n,d;
   for(n=0;n<kmean->ngroup;n++) 
   for(d=0;d<kmean->dim;d++)
      setmuij(kmean,n,d,0.0);
      
   for(n=0;n<kmean->nvec;n++)
   for(d=0;d<kmean->dim;d++){
     addmuij(kmean, kmean->groups[n], d, dataij(kmean,n,d) );
   }
   
   for(n=0;n<kmean->ngroup;n++) 
   for(d=0;d<kmean->dim;d++)
      setmuij(kmean,n,d,   muij(kmean,n,d)/kmean->count[n] );
}

static void KMEAN_sweep(struct KMEAN* kmean, double *err, int *changed)
{
   int n;
   
   *changed = 0;
   for(n=0;n<kmean->ngroup;n++)  kmean->count[n] = 0;
   
   for(n=0;n<kmean->nvec;n++){
     int g;
     g = WhichGroup(kmean,n);
     if(kmean->groups[n] != g) (*changed)++;
     kmean->groups[n] = g;
     kmean->count[ kmean->groups[n] ] ++;
     //printf("Vec %3d is Group %3d\n",n,g);
   }
   UpdateMu(kmean);
   *err =  KMEAN_error(kmean);
   
}

double KMEAN_cluster(struct KMEAN *kmean)
{
   int nchanged,it=0;
   double err;
   
   KMEAN_CalcScale(kmean);
   do{
     KMEAN_sweep(kmean, &err, &nchanged);
     //printf("%6d changed %6d  err=%11.4E\n",it++,nchanged, err);
   }while(nchanged>0);
}


void KMEAN_bisect(struct KMEAN *kmean, double *err)
{
   
   int g, worstg;
   double err0,minderr;
   char fname[1024];
   err0 = *err;
   minderr = err0;
   
   
   sprintf(fname,"skmean-g%d.plt",kmean->ngroup);
   KMEAN_OutputPlt(kmean, fname);
   
   // 找到最烂的分组
   for(g=0;g<kmean->ngroup;g++)
   {
      struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
      int i,si=0;
      double cerr;
      // fill group[g] into subset
      for(i=0;i<kmean->nvec;i++){
        if(kmean->groups[i]==g){
          int d;
          for(d=0;d<kmean->dim;d++)  setdataij(subset,si,d,  dataij(kmean,i,d));
          //subset->mark[si] = kmean->mark[i];
          si++;
        }
      }
      
      cerr = KMEAN_cluster(subset);
      KMEAN_free(subset);
      if(minderr > err0-cerr) {minderr = err0-cerr; worstg = g;}
   }
   
   g = worstg;   // 最烂的分组
   {
      struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
      int i,si=0,d;
      double cerr;
      // fill group[g] into subset
      for(i=0;i<kmean->nvec;i++){
        if(kmean->groups[i]==g){
          for(d=0;d<kmean->dim;d++)  setdataij(subset,si,d,  dataij(kmean,i,d));
          //subset->mark[si] = kmean->mark[i];
          si++;
        }
      }
      // 更新mu
      for(d=0;d<kmean->dim;d++) {
         setmuij(kmean, g, d,  muij(subset, 0, d) );
         setmuij(kmean, kmean->ngroup, d,  muij(subset, 1, d) );
      }
      kmean->ngroup++;
      KMEAN_free(subset);
      
      // 重新对kmean做一次分组
      *err = KMEAN_cluster(kmean);
   }
   
   return;
}



void KMEAN_BISECT(struct KMEAN* kmean)
{
    int  it=0,dest_ngroup = kmean->ngroup;
    double err;
    
    if(dest_ngroup<2) return;
    
    kmean->ngroup = 2;
    err = KMEAN_cluster(kmean);
    
    do{
       KMEAN_bisect(kmean, &err);
       printf("Step %6d err=%11.4E\n",++it, err);
    }while(kmean->ngroup<dest_ngroup);    
}








int main(void)
{
    struct KMEAN *kmean = KMEAN_init(80, 2, 4);
    double err;
    
    {
      FILE* fp = fopen("testSet.txt","r");
      int n=0;
      for(n=0;n<80;n++){
         double x,y;
         fscanf(fp,"%lf%lf", &x,&y);
         setdataij(kmean,n,0, x);
         setdataij(kmean,n,1, y);
      }
      fclose(fp);
    }
    /*  下面这一行是采用普通kmean */
    //err = KMEAN_cluster(kmean);  printf("err=%11.4E\n", err);
    
    /*  下面这一行是采用二分kmean */
    KMEAN_BISECT(kmean);
    
    KMEAN_OutputPlt(kmean, "skmean-g4.plt");
    
    KMEAN_free(kmean);

}
Avatar_small
Orissa 10th Matric I 说:
2022年8月17日 02:28

On the official website, the BSE Odisha Matric Model Question Paper 2023 and Madhyama Question Paper 2023 have been made available. The Orissa HSC & 10th Model Question Paper 2023, the Odisha HSC 10th Sample Question Paper, the Odisha HSC 10th Exam Guess Paper, and the Odisha HSC 10th Exam New Model Paper were all made available on the official website by the Odisha Board in January 2023. Orissa 10th Matric Important Question Paper 2022 Candidates looking for the Odisha Board 10th New Question Paper 2023 will soon be able to view their exam questions as well as the New Question Paper of 2023 on the board's official website. BSE announcements shall be made by the Odisha Board of Education. 10th HSC New Question Paper for Odisha 2023

Avatar_small
Alyssa 说:
2023年1月01日 21:16

The k-means++ algorithm is an improvement to the original k-means algorithm. The ++ in the name refers to the two steps of the pure cbd oil algorithm: choosing the initial centroids and then running k-means on the resulting dataset. There are a few advantages to using k-means++ over the original k-means algorithm. First, it typically converges faster. Second, it is less likely to get stuck in a local minimum. Finally, k-means++ produces more accurate results. To choose the initial centroids, the k-means++ algorithm first selects one at random.


登录 *


loading captcha image...
(输入验证码)
or Ctrl+Enter
Host by is-Programmer.com | Power by Chito 1.3.3 beta | © 2007 LinuxGem | Design by Matthew "Agent Spork" McGee