悲催的科学匠人 - 冷水's blog
kmean算法实现
具体描述见machine learning in action
main函数是一个二维点cluster测试
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
struct KMEAN{
int nvec,dim,ngroup;
double *data, /* [nvec][dim] */
*mu, /* [ngroup][dim] */
*scale, *min; /* [dim] */
int *groups, /* nvec */
*count; /* ngroup] */
};
struct KMEAN* KMEAN_init(int nvec, int dim, int ngroup)
{
struct KMEAN* kmean = (struct KMEAN*) calloc(1,sizeof(struct KMEAN));
kmean->nvec = nvec;
kmean->dim = dim;
kmean->ngroup = ngroup;
kmean->data = (double*) calloc(nvec*dim, sizeof(double));
kmean->mu = (double*) calloc(ngroup*dim, sizeof(double));
kmean->scale = (double*) calloc(dim, sizeof(double));
kmean->min = (double*) calloc(dim, sizeof(double));
//kmean->max = (double*) calloc(dim, sizeof(double));
kmean->groups = (int*) calloc(nvec, sizeof(int));
//kmean->mark = (int*) calloc(nvec, sizeof(int));
kmean->count = (int*) calloc(ngroup, sizeof(int));
}
void KMEAN_free(struct KMEAN* kmean)
{
free(kmean->data);
free(kmean->mu);
free(kmean->min);
//free(kmean->max);
free(kmean->scale);
free(kmean->groups);
//free(kmean->mark);
free(kmean->count);
}
inline static double dataij(struct KMEAN* kmean, int i, int j)
{
return kmean->data[i*kmean->dim + j];
}
inline static double muij(struct KMEAN *kmean, int i, int j)
{
return kmean->mu[i*kmean->dim + j];
}
inline static void setdataij(struct KMEAN* kmean, int i, int j, double v)
{
kmean->data[i*kmean->dim + j] = v;
}
inline static void setmuij(struct KMEAN *kmean, int i, int j, double v)
{
kmean->mu[i*kmean->dim + j] = v;
}
inline static void addmuij(struct KMEAN *kmean, int i, int j, double v)
{
kmean->mu[i*kmean->dim + j] += v;
}
void KMEAN_OutputPlt(struct KMEAN* kmean, const char* fname)
{
FILE *fp = fopen(fname,"w");
int n,d;
if(fp==NULL) {puts("Can not open file for outputing tecplot"); return;}
fprintf(fp,"ZONE T=\"DATA-%s\"\n", fname);
for(n=0;n<kmean->nvec;n++)
{
for(d=0;d<kmean->dim;d++)
fprintf(fp,"%11.4E ", dataij(kmean,n,d));
fprintf(fp,"%d\n", kmean->groups[n] );
}
fprintf(fp,"ZONE T=\"MU-%s\"\n", fname);
for(n=0;n<kmean->ngroup;n++)
{
for(d=0;d<kmean->dim;d++)
fprintf(fp,"%11.4E ", muij(kmean,n,d));
fprintf(fp,"%d\n", n );
}
fclose(fp);
}
void KMEAN_CalcScale(struct KMEAN *kmean)
{
int d,n;
double dmax,dmin;
// compute range of each dim
for(d=0;d<kmean->dim;d++){
dmax = dmin = dataij(kmean,0,d);
for(n=1;n<kmean->nvec;n++){
double x;
x = dataij(kmean,n,d);
if( dmax<x ) dmax = x;
if( dmin>x ) dmin = x;
}
kmean->scale[d] = dmax - dmin;
kmean->min[d] = dmin;
//kmean->max[d] = dmax;
//printf("Scaling: DIM %4d min=%11.4E max=%11.4E\n",d,dmin,dmax);
}
// randomly init mu
for(n=0;n<kmean->ngroup;n++){
//printf("init mu Group %3d ",n);
for(d=0;d<kmean->dim;d++){
setmuij(kmean, n,d, kmean->min[d] + 0.8*(rand()/(double)(RAND_MAX)) * kmean->scale[d] );
//printf("x_%3d=%11.4E ",d,muij(kmean,n,d));
}
//puts("");
}
for(n=0;n<kmean->nvec;n++) kmean->groups[n] = 0;
for(d=0;d<kmean->dim;d++)
kmean->scale[d] = 1.0/ (kmean->scale[d] * kmean->scale[d]);
}
static int WhichGroup(struct KMEAN* kmean, int i)
{
int g;
double mindist;
int mingroup;
for(g=0;g<kmean->ngroup;g++){
double dist=0;
int d;
for(d=0;d<kmean->dim;d++){
double xid,mgd;
xid = dataij(kmean,i,d);
mgd = muij(kmean,g,d);
dist += (xid-mgd) * (xid-mgd) * kmean->scale[d];
}
dist = sqrt(dist);
if(g==0) {
mindist = dist; mingroup = 0;
}else if(mindist>dist){
mindist = dist; mingroup = g;
}
}
return mingroup;
}
static double KMEAN_error(struct KMEAN* kmean)
{
double err=0.0;
int n;
for(n=0;n<kmean->nvec;n++)
{
int d,g;
g = kmean->groups[n];
for(d=0;d<kmean->dim;d++)
{
double dd = dataij(kmean,n,d) - muij(kmean,g,d);
err += dd*dd*kmean->scale[d];
}
}
return err;
}
static void UpdateMu(struct KMEAN* kmean)
{
int n,d;
for(n=0;n<kmean->ngroup;n++)
for(d=0;d<kmean->dim;d++)
setmuij(kmean,n,d,0.0);
for(n=0;n<kmean->nvec;n++)
for(d=0;d<kmean->dim;d++){
addmuij(kmean, kmean->groups[n], d, dataij(kmean,n,d) );
}
for(n=0;n<kmean->ngroup;n++)
for(d=0;d<kmean->dim;d++)
setmuij(kmean,n,d, muij(kmean,n,d)/kmean->count[n] );
}
static void KMEAN_sweep(struct KMEAN* kmean, double *err, int *changed)
{
int n;
*changed = 0;
for(n=0;n<kmean->ngroup;n++) kmean->count[n] = 0;
for(n=0;n<kmean->nvec;n++){
int g;
g = WhichGroup(kmean,n);
if(kmean->groups[n] != g) (*changed)++;
kmean->groups[n] = g;
kmean->count[ kmean->groups[n] ] ++;
//printf("Vec %3d is Group %3d\n",n,g);
}
UpdateMu(kmean);
*err = KMEAN_error(kmean);
}
double KMEAN_cluster(struct KMEAN *kmean)
{
int nchanged,it=0;
double err;
KMEAN_CalcScale(kmean);
do{
KMEAN_sweep(kmean, &err, &nchanged);
//printf("%6d changed %6d err=%11.4E\n",it++,nchanged, err);
}while(nchanged>0);
}
void KMEAN_bisect(struct KMEAN *kmean, double *err)
{
int g, worstg;
double err0,minderr;
char fname[1024];
err0 = *err;
minderr = err0;
sprintf(fname,"skmean-g%d.plt",kmean->ngroup);
KMEAN_OutputPlt(kmean, fname);
// 找到最烂的分组
for(g=0;g<kmean->ngroup;g++)
{
struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
int i,si=0;
double cerr;
// fill group[g] into subset
for(i=0;i<kmean->nvec;i++){
if(kmean->groups[i]==g){
int d;
for(d=0;d<kmean->dim;d++) setdataij(subset,si,d, dataij(kmean,i,d));
//subset->mark[si] = kmean->mark[i];
si++;
}
}
cerr = KMEAN_cluster(subset);
KMEAN_free(subset);
if(minderr > err0-cerr) {minderr = err0-cerr; worstg = g;}
}
g = worstg; // 最烂的分组
{
struct KMEAN* subset = KMEAN_init(kmean->count[g], kmean->dim, 2);
int i,si=0,d;
double cerr;
// fill group[g] into subset
for(i=0;i<kmean->nvec;i++){
if(kmean->groups[i]==g){
for(d=0;d<kmean->dim;d++) setdataij(subset,si,d, dataij(kmean,i,d));
//subset->mark[si] = kmean->mark[i];
si++;
}
}
// 更新mu
for(d=0;d<kmean->dim;d++) {
setmuij(kmean, g, d, muij(subset, 0, d) );
setmuij(kmean, kmean->ngroup, d, muij(subset, 1, d) );
}
kmean->ngroup++;
KMEAN_free(subset);
// 重新对kmean做一次分组
*err = KMEAN_cluster(kmean);
}
return;
}
void KMEAN_BISECT(struct KMEAN* kmean)
{
int it=0,dest_ngroup = kmean->ngroup;
double err;
if(dest_ngroup<2) return;
kmean->ngroup = 2;
err = KMEAN_cluster(kmean);
do{
KMEAN_bisect(kmean, &err);
printf("Step %6d err=%11.4E\n",++it, err);
}while(kmean->ngroup<dest_ngroup);
}
int main(void)
{
struct KMEAN *kmean = KMEAN_init(80, 2, 4);
double err;
{
FILE* fp = fopen("testSet.txt","r");
int n=0;
for(n=0;n<80;n++){
double x,y;
fscanf(fp,"%lf%lf", &x,&y);
setdataij(kmean,n,0, x);
setdataij(kmean,n,1, y);
}
fclose(fp);
}
/* 下面这一行是采用普通kmean */
//err = KMEAN_cluster(kmean); printf("err=%11.4E\n", err);
/* 下面这一行是采用二分kmean */
KMEAN_BISECT(kmean);
KMEAN_OutputPlt(kmean, "skmean-g4.plt");
KMEAN_free(kmean);
}
KNN算法的实现
具体描述见machine learning in action
knn.h
/* knn.c */ struct KNN *KNNInit(int nvectors, int dim, int ngroup, int *ierr); void KNNFree(struct KNN *knn, int *ierr); double *KNNGetdata(struct KNN *knn, int ivec, int *ierr); double KNNGetCoord(struct KNN *knn, int ivec, int icoord, int *ierr); void KNNSetGroup(struct KNN *knn, int ivec, int group, int *ierr); void KNNSetDataGroup(struct KNN *knn, int ivec, double *data, int group, int *ierr); void KNNCalcScale(struct KNN *knn, int *ierr); double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr); int KNNCalcGroup(struct KNN *knn, double *mydata, int k, int *ierr);
knn.c
#include <stdlib.h>
#include <math.h>
#include <stdlib.h>
struct KNN{
int nvec;
int dim;
double *data;
double *scale;
int ngroups;
int *group; /* group ID (0,1,..., ngroups-1) for each vec*/
};
inline int KNNGetDataOffset(struct KNN *knn, int ivec)
{
return ivec*knn->dim;
}
inline int KNNCheckIvec(struct KNN* knn, int ivec)
{
if( ivec<0 || 1+ivec>knn->nvec)
return 1;
else
return 0;
}
struct KNN* KNNInit(int nvectors, int dim, int ngroup, int *ierr)
{
double *data, *scale;
int *group, i;
if(nvectors<1) {*ierr = 1; puts("KNNInit, error, nvectors<1"); return NULL; }
if(dim<1) {*ierr= 2; puts("KNNInit, error, dim<1"); return NULL; }
if(ngroup<2) {*ierr=4; puts("KNNInit, error, ngroup<2"); return; }
data = (double*) calloc(nvectors*dim, sizeof(double));
if (data==NULL) {*ierr = 5; puts("KNNInit, error, calloc knn->data fail"); return; }
group = (int*) calloc(nvectors, sizeof(int));
if(group==NULL) {*ierr=6; puts("KNNInit, error, calloc knn->group fail"); return;}
for(i=0;i<nvectors;i++) group[i] = -1;
scale = (double*) calloc(dim, sizeof(double));
if(scale==NULL) {*ierr=7; puts("KNNInit, error, calloc knn->scale fail"); return;}
struct KNN* knn = (struct KNN*)calloc(1,sizeof(struct KNN));
if(knn==NULL) {*ierr=999; puts("KNNInit, error, calloc KNN fail"); return NULL;}
knn->data = data;
knn->group = group;
knn->scale = scale;
knn->nvec= nvectors;
knn->dim = dim;
knn->ngroups = ngroup;
ierr = 0;
return knn;
}
void KNNFree(struct KNN* knn, int* ierr)
{
if(knn==NULL) return;
if(knn->data) free(knn->data);
if(knn->group) free(knn->group);
free(knn);
}
double* KNNGetdata(struct KNN *knn, int ivec, int *ierr)
{
*ierr = KNNCheckIvec(knn,ivec);
if(*ierr) {puts("KNNGetData, irror, Wrong ivec"); return;}
return knn->data + KNNGetDataOffset(knn, ivec);
}
double KNNGetCoord(struct KNN* knn, int ivec, int icoord, int* ierr)
{
double *data;
data = KNNGetdata(knn, ivec, ierr);
if(*ierr) return 0;
if(icoord<0 || icoord>=knn->dim) {
puts("KNNGetCoord, error, wrong icoord"); *ierr= 2; return 0;
}
*ierr = 0;
return data[icoord];
}
void KNNSetGroup(struct KNN* knn, int ivec, int group, int* ierr)
{
*ierr = KNNCheckIvec(knn,ivec); if(*ierr) {puts("KNNSetGroup, error, Wrong ivec"); return;}
*ierr = group<0 || group>=knn->ngroups; if(*ierr) {puts("KNNSetGroup, error, Wrong group ID (should be [0,..,ngroups-1])"); return;}
knn->group[ivec] = group;
*ierr = 0;
return;
}
void KNNSetDataGroup(struct KNN *knn,int ivec, double *data, int group, int* ierr)
{
int i;
double *knndata = KNNGetdata(knn, ivec, ierr);
if(*ierr) return;
for(i=0; i<knn->dim; i++) knndata[i] = data[i];
KNNSetGroup(knn, ivec, group, ierr);
if(*ierr) return;
*ierr = 0;
return;
}
static int KNNIsDataFilled(struct KNN* knn)
{
int i;
for(i=0; i<knn->nvec; i++) if(knn->group[i]==-1) return 0;
return 1;
}
void KNNCalcScale(struct KNN* knn, int *ierr)
{
int i,d;
if(!KNNIsDataFilled(knn)) { puts("KNNCalcScale, error, data is not filled"); return; }
for(d=0;d<knn->dim;d++){
double xmax,xmin;
xmin = KNNGetCoord(knn, 0, d, ierr);
xmax = xmin;
for(i=1; i<knn->nvec; i++){
double f;
f = KNNGetCoord(knn, i, d, ierr);
if(xmax<f) xmax = f;
if(xmin>f) xmin = f;
}
if(xmax!=xmin)
knn->scale[d] = 1./((xmax-xmin)*(xmax-xmin));
else
knn->scale[d] = 1.0;
}
}
double KNNCalcDistance(struct KNN *knn, int ivec, double *mydata, int *ierr)
{
double *data=NULL, dist=0.0;
int i;
data = KNNGetdata(knn, ivec, ierr); if(*ierr) {puts("KNNCalcDistance, error, KNNGetData fail"); return;}
for(i=0;i<knn->dim;i++){
dist += (mydata[i]-data[i]) * (mydata[i]-data[i]) * knn->scale[i];
}
*ierr = 0;
return sqrt(dist);
}
static void make_list(double *dlist, int* idlist, int n, int cid, double cdist)
{
double dmax;
int imax, i;
dmax = dlist[0]; imax=0;
for(i=1;i<n;i++){
if(dlist[i]>dmax) {dmax=dlist[i]; imax=i;}
}
if(cdist>dmax)
return;
else{
dlist[imax] = cdist;
idlist[imax]= cid;
}
}
static int vote(struct KNN* knn, int *groups, int k)
{
int *votes;
int i, max=0;
votes = (int*) calloc(knn->ngroups, sizeof(int));
for(i=0;i<knn->ngroups;i++) votes[i]=0;
for(i=0; i<k; i++)
votes[ groups[i] ] ++;
for(i=1; i<knn->ngroups; i++) {
if(votes[i]>votes[max]) max=i;
}
return max;
}
int KNNCalcGroup(struct KNN* knn, double *mydata, int k, int* ierr)
{
double* dists;
int* vecid;
int n;
double dmax,dmin;
int imax,imin;
dists = (double*) calloc(k, sizeof(double));
if(dists==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for dists"); return;}
vecid = (int*) calloc(k, sizeof(int));
if(vecid==NULL) {*ierr=-1; puts("KNNCalcGroup, error, cannot allocate space for vecid"); return;}
for(n=0;n<k;n++){
dists[n] = -1.;
vecid[n] = -1;
}
for(n=0;n<knn->nvec;n++){
double cdist;
cdist = KNNCalcDistance(knn, n, mydata, ierr);
if(n==0){
int l;
for(l=0;l<k;l++) {
dists[l] = cdist; dmax=cdist;dmin=cdist;
vecid[l] = n; imax=n; imin=n;
}
}
else{
make_list(dists, vecid, k, n, cdist);
}
}
free(dists);
for(n=0;n<k;n++) vecid[n] = knn->group[vecid[n]];
n = vote(knn, vecid,k);
free(vecid);
return n;
}
测试手写数字识别的程序 digits.c
#include <stdio.h>
#include "knn.h"
#define NVEC 1934
#define DIM 1024
#define NGROUP 10
#define K 3
void InputData(struct KNN *knn)
{
FILE* fp;
int id=0, i, ndata[10]={188, 197, 194, 198, 185, 186, 194, 200, 179, 203};
double mydata[DIM];
int g, n, ierr;
char fname[32];
for(g=0;g<10;g++)
for(n=0;n<=ndata[g];n++){
double mydata[1024];
sprintf(fname,"trainingDigits/%d_%d.txt", g,n);
//puts(fname);
fp = fopen(fname,"r");
for(i=0;i<1024;i++){
int d;
fscanf(fp,"%1d",&d);
mydata[i] = d;
}
KNNSetDataGroup(knn, id, mydata, g, &ierr);
if(ierr!=0) puts("ERROR!!");
id ++;
fclose(fp);
}
printf("Read %d training data\n",id);
}
void Test(struct KNN *knn)
{
FILE* fp;
int id=0, i, ndata[10]={86, 96, 91, 84, 99, 99, 86, 95, 90, 88};
double mydata[DIM];
int g, n, ierr,score;
char fname[32];
for(g=0;g<10;g++)
{
score = 0;
for(n=0;n<=ndata[g];n++){
double mydata[1024];
int ans;
sprintf(fname,"testDigits/%d_%d.txt", g,n);
fp = fopen(fname,"r");
for(i=0;i<1024;i++){
int d;
fscanf(fp,"%1d",&d);
mydata[i] = d;
}
fclose(fp);
ans = KNNCalcGroup(knn, mydata, K, &ierr);
if(ans==g) score ++;
else{
printf(" fail for %s\n",fname);
}
}
printf("Digit %1d score=%3d/%3d = %6.2f%%\n",
g, score, ndata[g]+1, 100.*score/(ndata[g]+1.));
}
}
int main(void)
{
int ierr;
struct KNN* knn;
knn = KNNInit(NVEC,DIM,NGROUP, &ierr);
if(ierr) {puts("can not KNNInit"); return; }
InputData(knn);
KNNCalcScale(knn, &ierr);
Test(knn);
KNNFree(knn, &ierr);
}