悲催的科学匠人 - 冷水's blog
kmean算法实现
具体描述见machine learning in action
main函数是一个二维点cluster测试
#include <math.h> #include <stdlib.h> #include <stdio.h> struct KMEAN{ int nvec,dim,ngroup; double *data, /* [nvec][dim] */ *mu, /* [ngroup][dim] */ *scale, *min; /* [dim] */ int *groups, /* nvec */ *count; /* ngroup] */ }; struct KMEAN* KMEAN_init(int nvec, int dim, int ngroup) { struct KMEAN* kmean = (struct KMEAN*) calloc(1,sizeof(struct KMEAN)); kmean->nvec = nvec; kmean->dim = dim; kmean->ngroup = ngroup; kmean->data = (double*) calloc(nvec*dim, sizeof(double)); kmean->mu = (double*) calloc(ngroup*dim, sizeof(double)); kmean->scale = (double*) calloc(dim, sizeof(double)); kmean->min = (double*) calloc(dim, sizeof(double)); //kmean->max = (double*) calloc(dim, sizeof(double)); kmean->groups = (int*) calloc(nvec, sizeof(int)); //kmean->mark = (int*) calloc(nvec, sizeof(int)); kmean->count = (int*) calloc(ngroup, sizeof(int)); } void KMEAN_free(struct KMEAN* kmean) { free(kmean->data); free(kmean->mu); free(kmean->min); //free(kmean->max); free(kmean->scale); free(kmean->groups); //free(kmean->mark); free(kmean->count); } inline static double dataij(struct KMEAN* kmean, int i, int j) { return kmean->data[i*kmean->dim + j]; } inline static double muij(struct KMEAN *kmean, int i, int j) { return kmean->mu[i*kmean->dim + j]; } inline static void setdataij(struct KMEAN* kmean, int i, int j, double v) { kmean->data[i*kmean->dim + j] = v; } inline static void setmuij(struct KMEAN *kmean, int i, int j, double v) { kmean->mu[i*kmean->dim + j] = v; } inline static void addmuij(struct KMEAN *kmean, int i, int j, double v) { kmean->mu[i*kmean->dim + j] += v; } void KMEAN_OutputPlt(struct KMEAN* kmean, const char* fname) { FILE *fp = fopen(fname,"w"); int n,d; if(fp==NULL) {puts("Can not open file for outputing tecplot"); return;} fprintf(fp,"ZONE T=\"DATA-%s\"\n", fname); for(n=0;n<kmean->nvec;n++) { for(d=0;d<kmean->dim;d++) fprintf(fp,"%11.4E ", dataij(kmean,n,d)); fprintf(fp,"%d\n", kmean->groups[n] ); } fprintf(fp,"ZONE T=\"MU-%s\"\n", fname); for(n=0;n<kmean->ngroup;n++) { for(d=0;d<kmean->dim;d++) fprintf(fp,"%11.4E ", muij(kmean,n,d)); fprintf(fp,"%d\n", n ); } fclose(fp); } void KMEAN_CalcScale(struct KMEAN *kmean) { int d,n; double dmax,dmin; // compute range of each dim for(d=0;d<kmean->dim;d++){ dmax = dmin = dataij(kmean,0,d); for(n=1;n<kmean->nvec;n++){ double x; x = dataij(kmean,n,d); if( dmax<x ) dmax = x; if( dmin>x ) dmin = x; } kmean->scale[d] = dmax - dmin; kmean->min[d] = dmin; //kmean->max[d] = dmax; //printf("Scaling: DIM %4d min=%11.4E max=%11.4E\n",d,dmin,dmax); } // randomly init mu for(n=0;n<kmean->ngroup;n++){ //printf("init mu Group %3d ",n); for(d=0;d<kmean->dim;d++){ setmuij(kmean, n,d, kmean->min[d] + 0.8*(rand()/(double)(RAND_MAX)) * kmean->scale[d] ); //printf("x_%3d=%11.4E ",d,muij(kmean,n,d)); } //puts(""); } for(n=0;n<kmean->nvec;n++) kmean->groups[n] = 0; for(d=0;d<kmean->dim;d++) kmean->scale[d] = 1.0/ (kmean->scale[d] * kmean->scale[d]); } static int WhichGroup(struct KMEAN* kmean, int i) { int g; double mindist; int mingroup; for(g=0;g<kmean->ngroup;g++){ double dist=0; int d; for(d=0;d<kmean->dim;d++){ double xid,mgd; xid = dataij(kmean,i,d); mgd = muij(kmean,g,d); dist += (xid-mgd) * (xid-mgd) * kmean->scale[d]; } dist = sqrt(dist); if(g==0) { mindist = dist; mingroup = 0; }else if(mindist>dist){ mindist = dist; mingroup = g; } } return mingroup; } static double KMEAN_error(struct KMEAN* kmean) { double err=0.0; int n; for(n=0;n<kmean->nvec;n++) { int d,g; g = kmean->groups[n]; for(d=0;d<kmean->dim;d++) { double dd = dataij(kmean,n,d) - muij(kmean,g,d); err += dd*dd*kmean->scale[d]; } } return err; } static void UpdateMu(struct KMEAN* kmean) { int n,d; for(n=0;n<kmean->ngroup;n++) for(d=0;d<kmean->dim;d++) setmuij(kmean,n,d,0.0); for(n=0;n<kmean->nvec;n++) for(d=0;d<kmean->dim;d++){ addmuij(kmean, kmean->groups[n], d, dataij(kmean,n,d) ); } for(n=0;n<kmean->ngroup;n++) for(d=0;d<kmean->dim;d++) setmuij(kmean,n,d, muij(kmean,n,d)/kmean->count[n] ); } static void KMEAN_sweep(struct KMEAN* kmean, double *err, int *changed) { int n; *changed = 0; for(n=0;n<kmean->ngroup;n++) kmean->count[n] = 0; for(n=0;n<kmean->nvec;n++){ int g; g = WhichGroup(kmean,n); if(kmean->groups[n] != g) (*changed)++; kmean->groups[n] = g; kmean->count[ kmean->groups[n] ] ++; //printf("Vec %3d is Group %3d\n",n,g); } UpdateMu(kmean); *err = KMEAN_error(kmean); } double KMEAN_cluster(struct KMEAN *kmean) { int nchanged,it=0; double err; KMEAN_CalcScale(kmean); do{ KMEAN_sweep(kmean, &err, &nchanged); //printf("%6d changed %6d err=%11.4E\n",it++,nchanged, err); }while(nchanged>0); } void KMEAN_bisect(struct KMEAN *kmean, double *err) { int g, worstg; double err0,minderr; char fname[1024]; err0 = *err; minderr = err0; sprintf(fname,"skmean-g%d.plt",kmean->ngroup); KMEAN_OutputPlt(kmean, fname); // 找到最烂的分组 for(g=0;g<kmean->ngroup;g++) { struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2); int i,si=0; double cerr; // fill group[g] into subset for(i=0;i<kmean->nvec;i++){ if(kmean->groups[i]==g){ int d; for(d=0;d<kmean->dim;d++) setdataij(subset,si,d, dataij(kmean,i,d)); //subset->mark[si] = kmean->mark[i]; si++; } } cerr = KMEAN_cluster(subset); KMEAN_free(subset); if(minderr > err0-cerr) {minderr = err0-cerr; worstg = g;} } g = worstg; // 最烂的分组 { struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2); int i,si=0,d; double cerr; // fill group[g] into subset for(i=0;i<kmean->nvec;i++){ if(kmean->groups[i]==g){ for(d=0;d<kmean->dim;d++) setdataij(subset,si,d, dataij(kmean,i,d)); //subset->mark[si] = kmean->mark[i]; si++; } } // 更新mu for(d=0;d<kmean->dim;d++) { setmuij(kmean, g, d, muij(subset, 0, d) ); setmuij(kmean, kmean->ngroup, d, muij(subset, 1, d) ); } kmean->ngroup++; KMEAN_free(subset); // 重新对kmean做一次分组 *err = KMEAN_cluster(kmean); } return; } void KMEAN_BISECT(struct KMEAN* kmean) { int it=0,dest_ngroup = kmean->ngroup; double err; if(dest_ngroup<2) return; kmean->ngroup = 2; err = KMEAN_cluster(kmean); do{ KMEAN_bisect(kmean, &err); printf("Step %6d err=%11.4E\n",++it, err); }while(kmean->ngroup<dest_ngroup); } int main(void) { struct KMEAN *kmean = KMEAN_init(80, 2, 4); double err; { FILE* fp = fopen("testSet.txt","r"); int n=0; for(n=0;n<80;n++){ double x,y; fscanf(fp,"%lf%lf", &x,&y); setdataij(kmean,n,0, x); setdataij(kmean,n,1, y); } fclose(fp); } /* 下面这一行是采用普通kmean */ //err = KMEAN_cluster(kmean); printf("err=%11.4E\n", err); /* 下面这一行是采用二分kmean */ KMEAN_BISECT(kmean); KMEAN_OutputPlt(kmean, "skmean-g4.plt"); KMEAN_free(kmean); }
KNN算法的实现
具体描述见machine learning in action
knn.h
/* knn.c */ struct KNN *KNNInit(int nvectors, int dim, int ngroup, int *ierr); void KNNFree(struct KNN *knn, int *ierr); double *KNNGetdata(struct KNN *knn, int ivec, int *ierr); double KNNGetCoord(struct KNN *knn, int ivec, int icoord, int *ierr); void KNNSetGroup(struct KNN *knn, int ivec, int group, int *ierr); void KNNSetDataGroup(struct KNN *knn, int ivec, double *data, int group, int *ierr); void KNNCalcScale(struct KNN *knn, int *ierr); double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr); int KNNCalcGroup(struct KNN *knn, double *mydata, int k, int *ierr);
knn.c
#include <stdlib.h> #include <math.h> #include <stdlib.h> struct KNN{ int nvec; int dim; double *data; double *scale; int ngroups; int *group; /* group ID (0,1,..., ngroups-1) for each vec*/ }; inline int KNNGetDataOffset(struct KNN *knn, int ivec) { return ivec*knn->dim; } inline int KNNCheckIvec(struct KNN* knn, int ivec) { if( ivec<0 || 1+ivec>knn->nvec) return 1; else return 0; } struct KNN* KNNInit(int nvectors, int dim, int ngroup, int *ierr) { double *data, *scale; int *group, i; if(nvectors<1) {*ierr = 1; puts("KNNInit, error, nvectors<1"); return NULL; } if(dim<1) {*ierr= 2; puts("KNNInit, error, dim<1"); return NULL; } if(ngroup<2) {*ierr=4; puts("KNNInit, error, ngroup<2"); return; } data = (double*) calloc(nvectors*dim, sizeof(double)); if (data==NULL) {*ierr = 5; puts("KNNInit, error, calloc knn->data fail"); return; } group = (int*) calloc(nvectors, sizeof(int)); if(group==NULL) {*ierr=6; puts("KNNInit, error, calloc knn->group fail"); return;} for(i=0;i<nvectors;i++) group[i] = -1; scale = (double*) calloc(dim, sizeof(double)); if(scale==NULL) {*ierr=7; puts("KNNInit, error, calloc knn->scale fail"); return;} struct KNN* knn = (struct KNN*)calloc(1,sizeof(struct KNN)); if(knn==NULL) {*ierr=999; puts("KNNInit, error, calloc KNN fail"); return NULL;} knn->data = data; knn->group = group; knn->scale = scale; knn->nvec= nvectors; knn->dim = dim; knn->ngroups = ngroup; ierr = 0; return knn; } void KNNFree(struct KNN* knn, int* ierr) { if(knn==NULL) return; if(knn->data) free(knn->data); if(knn->group) free(knn->group); free(knn); } double* KNNGetdata(struct KNN *knn, int ivec, int *ierr) { *ierr = KNNCheckIvec(knn,ivec); if(*ierr) {puts("KNNGetData, irror, Wrong ivec"); return;} return knn->data + KNNGetDataOffset(knn, ivec); } double KNNGetCoord(struct KNN* knn, int ivec, int icoord, int* ierr) { double *data; data = KNNGetdata(knn, ivec, ierr); if(*ierr) return 0; if(icoord<0 || icoord>=knn->dim) { puts("KNNGetCoord, error, wrong icoord"); *ierr= 2; return 0; } *ierr = 0; return data[icoord]; } void KNNSetGroup(struct KNN* knn, int ivec, int group, int* ierr) { *ierr = KNNCheckIvec(knn,ivec); if(*ierr) {puts("KNNSetGroup, error, Wrong ivec"); return;} *ierr = group<0 || group>=knn->ngroups; if(*ierr) {puts("KNNSetGroup, error, Wrong group ID (should be [0,..,ngroups-1])"); return;} knn->group[ivec] = group; *ierr = 0; return; } void KNNSetDataGroup(struct KNN *knn,int ivec, double *data, int group, int* ierr) { int i; double *knndata = KNNGetdata(knn, ivec, ierr); if(*ierr) return; for(i=0; i<knn->dim; i++) knndata[i] = data[i]; KNNSetGroup(knn, ivec, group, ierr); if(*ierr) return; *ierr = 0; return; } static int KNNIsDataFilled(struct KNN* knn) { int i; for(i=0; i<knn->nvec; i++) if(knn->group[i]==-1) return 0; return 1; } void KNNCalcScale(struct KNN* knn, int *ierr) { int i,d; if(!KNNIsDataFilled(knn)) { puts("KNNCalcScale, error, data is not filled"); return; } for(d=0;d<knn->dim;d++){ double xmax,xmin; xmin = KNNGetCoord(knn, 0, d, ierr); xmax = xmin; for(i=1; i<knn->nvec; i++){ double f; f = KNNGetCoord(knn, i, d, ierr); if(xmax<f) xmax = f; if(xmin>f) xmin = f; } if(xmax!=xmin) knn->scale[d] = 1./((xmax-xmin)*(xmax-xmin)); else knn->scale[d] = 1.0; } } double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr) { double *data=NULL, dist=0.0; int i; data = KNNGetdata(knn, ivec, ierr); if(*ierr) {puts("KNNCalcDistance, error, KNNGetData fail"); return;} for(i=0;i<knn->dim;i++){ dist += (mydata[i]-data[i]) * (mydata[i]-data[i]) * knn->scale[i]; } *ierr = 0; return sqrt(dist); } static void make_list(double *dlist, int* idlist, int n, int cid, double cdist) { double dmax; int imax, i; dmax = dlist[0]; imax=0; for(i=1;i<n;i++){ if(dlist[i]>dmax) {dmax=dlist[i]; imax=i;} } if(cdist>dmax) return; else{ dlist[imax] = cdist; idlist[imax]= cid; } } static int vote(struct KNN* knn, int *groups, int k) { int *votes; int i, max=0; votes = (int*) calloc(knn->ngroups, sizeof(int)); for(i=0;i<knn->ngroups;i++) votes[i]=0; for(i=0; i<k; i++) votes[ groups[i] ] ++; for(i=1; i<knn->ngroups; i++) { if(votes[i]>votes[max]) max=i; } return max; } int KNNCalcGroup(struct KNN* knn, double *mydata, int k, int* ierr) { double* dists; int* vecid; int n; double dmax,dmin; int imax,imin; dists = (double*) calloc(k, sizeof(double)); if(dists==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for dists"); return;} vecid = (int*) calloc(k, sizeof(int)); if(vecid==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for vecid"); return;} for(n=0;n<k;n++){ dists[n] = -1.; vecid[n] = -1; } for(n=0;n<knn->nvec;n++){ double cdist; cdist = KNNCalcDistance(knn, n, mydata, ierr); if(n==0){ int l; for(l=0;l<k;l++) { dists[l] = cdist; dmax=cdist;dmin=cdist; vecid[l] = n; imax=n; imin=n; } } else{ make_list(dists, vecid, k, n, cdist); } } free(dists); for(n=0;n<k;n++) vecid[n] = knn->group[vecid[n]]; n = vote(knn, vecid,k); free(vecid); return n; }
测试手写数字识别的程序 digits.c
#include <stdio.h> #include "knn.h" #define NVEC 1934 #define DIM 1024 #define NGROUP 10 #define K 3 void InputData(struct KNN *knn) { FILE* fp; int id=0, i, ndata[10]={188, 197, 194, 198, 185, 186, 194, 200, 179, 203}; double mydata[DIM]; int g, n, ierr; char fname[32]; for(g=0;g<10;g++) for(n=0;n<=ndata[g];n++){ double mydata[1024]; sprintf(fname,"trainingDigits/%d_%d.txt", g,n); //puts(fname); fp = fopen(fname,"r"); for(i=0;i<1024;i++){ int d; fscanf(fp,"%1d",&d); mydata[i] = d; } KNNSetDataGroup(knn, id, mydata, g, &ierr); if(ierr!=0) puts("ERROR!!"); id ++; fclose(fp); } printf("Read %d training data\n",id); } void Test(struct KNN *knn) { FILE* fp; int id=0, i, ndata[10]={86, 96, 91, 84, 99, 99, 86, 95, 90, 88}; double mydata[DIM]; int g, n, ierr,score; char fname[32]; for(g=0;g<10;g++) { score = 0; for(n=0;n<=ndata[g];n++){ double mydata[1024]; int ans; sprintf(fname,"testDigits/%d_%d.txt", g,n); fp = fopen(fname,"r"); for(i=0;i<1024;i++){ int d; fscanf(fp,"%1d",&d); mydata[i] = d; } fclose(fp); ans = KNNCalcGroup(knn, mydata, K, &ierr); if(ans==g) score ++; else{ printf(" fail for %s\n",fname); } } printf("Digit %1d score=%3d/%3d = %6.2f%%\n", g, score, ndata[g]+1, 100.*score/(ndata[g]+1.)); } } int main(void) { int ierr; struct KNN* knn; knn = KNNInit(NVEC,DIM,NGROUP, &ierr); if(ierr) {puts("can not KNNInit"); return; } InputData(knn); KNNCalcScale(knn, &ierr); Test(knn); KNNFree(knn, &ierr); }