C 4.5 알고리즘 분류 기

c 4.5 알고리즘 은 ID3 알고리즘 의 개선 으로 정보 버 프 를 사용 하여 속성 을 선택 하고 정보 버 프 를 사용 하여 속성 을 선택 할 때 선택 값 이 많은 속성의 부족 을 극복 합 니 다.정보 이득 률 은 이 중 Gain (S, A) 이 ID3 알고리즘 의 정보 이득 과 동일 하 다 고 정의 하 며, 분류 정보 SplitInfo (S, A) 는 속성 A 에 따라 샘플 집합 S 의 넓이 와 균일 성 을 나 타 냅 니 다. 그 중에서 Si 부터 Sc 까지 는 c 개의 속성 이 다른 값 의 속성 A 분할 S 로 구 성 된 c 개의 샘플 집합 입 니 다.
C 4.5 분 산 된 설명 속성 도 처리 할 수 있 고 연속 적 인 설명 속성 도 처리 할 수 있 습 니 다. 특정한 노드 의 분기 속성 을 선택 할 때 분 산 된 설명 속성. c 4.5. 의 처리 방법 은 ID3 와 같 습 니 다. 이 속성 자체 의 수치 갯 수 에 따라 계산 하고 특정한 연속 적 인 설명 속성 에 대해 작은 것 부터 큰 것 까지 각각 중점 을 분할 점 으로 한 다음 에 이 분할 점 에 따라 전후 정보 이득 률 이 가장 큰 분할 점 을 선택 하여 가 지 를 합 니 다.

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <math.h>

#define EPS 0.000001

typedef struct Tuple

{

	int i;

    int g;

    double h;

    int c;

}tuple;

typedef struct TNode{

    double  gap;

    int attri;

    int reachValue;

    struct TNode *child[50];

    int kind;

}node;

tuple trainData[100];

double cal_entropy(tuple *data,int len);

double choose_best_gap(tuple *data,int len);

double cal_grainRatio(tuple *data,int len);

double cal_grainRatio2(tuple *data,int len,double gap);

double cal_splitInfo(tuple *data,int len);

int check_attribute(tuple *data,int len);

int choose_attribute(tuple *data,int len);

node *build_tree(tuple *data,int len,double reachValue,double gap);

void print_blank(int depth);

void traverse(node *no,int depth);

void test_data(node *root,tuple *data);

int cmp(const void *a, const void *b)

{

    tuple *a1=(tuple *)a;

    tuple *b1=(tuple *)b;

    return a1->h-b1->h>0?1:-1;

}

void copy_tuple(tuple *source,tuple *destination)

{

    destination->c=source->c;

    destination->g=source->g;

    destination->h=source->h;

	destination->i=source->i;

}

int main()

{

    FILE *fp;

    fp=fopen("./data.txt", "r");

    if(fp==NULL)

    {

        printf("can not open the file: data.txt
");

        return 0;

    }

    char name[50];

    double height;

    char gender[10];

    char kind[10];

    int i=0;

    while(fscanf(fp, "%s",name)!=EOF)

    {

		trainData[i].i=i;

        fscanf(fp,"%s",gender);

        if(!strcmp(gender, "M"))

        {

            trainData[i].g=0;

        }

        else trainData[i].g=1;

        fscanf(fp,"%lf",&height);

        trainData[i].h=height;

        fscanf(fp,"%s",kind);

        if(!strcmp(kind, "Short"))

        {

            trainData[i].c=0;

        }

        else if(!strcmp(kind,"Medium"))

        {

            trainData[i].c=1;

        }

        else{

            trainData[i].c=2;

        }

        i++;

    }

    int rows=i;

	node *root=build_tree(trainData,rows,-1,-1);

	 traverse(root,0);printf("
");

	 fp=fopen("./testData.txt", "r");

	     if(fp==NULL)

	     {

	         printf("can not open the file!
");

	         return 0;

	     }

	     tuple testData;

	     fscanf(fp, "%s",name);

	     fscanf(fp,"%s",gender);

	     if(!strcmp(gender, "M"))

	     {

	         testData.g=0;

	     }

	     else  testData.g=1;

	     fscanf(fp,"%lf",&height);

	      testData.h=height;

	   //  printf("testData: gender: %d\theight: %lf
",testData.g,testData.h);

		 fclose(fp);

		 fp=NULL;

		 test_data(root,&testData);

}

void test_data(node *root,tuple *data)

{

	/*

     1.        

     2.        gap    <=   ，     

     3.        reachValue  

     */

    if(root->attri==-1)

    {

        printf("the test data belongs to:");

        switch (root->kind) {

            case 0: printf("Short
");break;

            case 1: printf("Medium
");break;

            case 2: printf("Tall
");break;

            default:break;

        }

		return;

    }

	if(root->attri==0)

    {

        if(data->g==0)

        {

            test_data(root->child[0],data);

        }

        else

        {

            test_data(root->child[1], data);

        }

    }

    else

    {

		//printf("gap: %lf
",root->gap);

        if(data->h<=root->gap)

        {

            test_data(root->child[0], data);

        }

        else{

            test_data(root->child[1], data);

        }

    }

}



void print_blank(int depth)

{

    int i;

    for(i=0;i<depth;i++)

    {

        printf("\t");

    }

}

void traverse(node *no,int depth)

{

    if(no==NULL)return;

    int i;

	printf("-------------------
");

	print_blank(depth);

    printf("attri: %d
",no->attri);print_blank(depth);

    printf("gap: %lf
",no->gap);print_blank(depth);

    printf("kind: %d
",no->kind);print_blank(depth);

    printf("reachValue: %d
",no->reachValue);print_blank(depth);

	printf("-------------------
");print_blank(depth);

    for(i=0;no->child[i]!=NULL;i++)

    {

        traverse(no->child[i], depth+1);

    }

}

int choose_attribute(tuple *data,int len)//      ，         

{

    int i;

    /*

     1.     ，       

     2.               gap

     3.                     

     */

    double gGrainRatio=cal_grainRatio(data, len);

	//printf("gGrainRatio: %lf
",gGrainRatio);

    /*          

     1.    gap

     2.    gap      

     3.                       

     */

    qsort(data,len,sizeof(tuple),cmp);

    double maxGap=-1;

    double maxRaito=-1;

	//printf("len: %d
",len);

    for(i=1;i<len;i++)

    {

        double gap=(data[i-1].h+data[i].h)/2;

		//printf("gap: %lf
",gap);

        double ratio=cal_grainRatio2(data,len,gap);//     gap         ratio

		//printf("ratio: %lf
",ratio);

        if(maxRaito<ratio)///        

        {

            maxRaito=ratio;maxGap=gap;//         gap

        }

    }

	//printf("maxRaito: %lf
",maxRaito);

    if(gGrainRatio>maxRaito)

    {

        return 0;

    }

    else

    {

        return 1;

    }

}



node *build_tree(tuple *data,int len,double reachValue,double gap)

{

	//getchar();getchar();

    int i,j;

	/*for(i=0;i<len;i++)

	{

		printf("data i: %d g:%d h:%lf c:%d
",data[i].i,data[i].g,data[i].h,data[i].c);

	}*/

    int kind=check_attribute(data, len);//               

	//printf("kind: %d
",kind);

    if(kind!=0)//                      

    {

	//	printf("leaves constructed completed!
");

        node *newNode=(node *)malloc(sizeof(node));

        newNode->gap=gap;//             gap；

        newNode->attri=-1;

        newNode->reachValue=reachValue;

        newNode->kind=kind-1;

        for(i=0;i<50;i++)newNode->child[i]=NULL;//          

        return newNode;

    }

    //               

    int attribute=choose_attribute(data, len);

	//printf("choose: %d
",attribute);

    //              

    node *newNode=(node *)malloc(sizeof(node));

    newNode->reachValue=reachValue;

    newNode->attri=attribute;

    newNode->kind=-1;

	newNode->gap=gap;

    for(i=0;i<50;i++)newNode->child[i]=NULL;

    if(attribute==0)//        

    {

        for(i=0;i<2;i++)

        {

            tuple subData[100];

            int sublen=0;

            for(j=0;j<len;j++)

            {

                if(data[j].g==i/*       */)

                {

                    copy_tuple(&data[j],&subData[sublen++]);

                }

            }

			if(sublen==0)continue;

            newNode->child[i]=build_tree(subData,sublen,i,-1);//         ，    gap     

        }

    }

    else

    {

        //      

        /*

         1.        

         2.      left right    

         */

        double gap=choose_best_gap(data,len);//          

		newNode->gap=gap;

		//printf("best gap: %lf
",gap);

        tuple leftData[100],rightData[100];//     ，          

        int leftlen=0;//       

        int rightlen=0;

        for(i=0;i<len;i++)

        {

            if(data[i].h<=gap)

            {

                copy_tuple(&data[i],&leftData[leftlen++]);

            }

            else{

                copy_tuple(&data[i],&rightData[rightlen++]);

            }

        }

		if(leftlen!=0)

        newNode->child[0]=build_tree(leftData,leftlen,-1,gap);//        ，         

		if(rightlen!=0)

        newNode->child[1]=build_tree(rightData,rightlen,-1,gap);

    }

    return newNode;

}

int check_attribute(tuple *data,int len)//             

{

    /*

     1.       ，            ，   

     */

    int i;

    for(i=1;i<len;i++)

    {

        if(data[i].c!=data[i-1].c)return 0;

    }

    return data[0].c+1;

}

double cal_grainRatio(tuple *data, int len)//            

{

    /*

     1.        

     2.                  

     3.             

     4.        

     5.    

     6.       

     */

    double preEntropy=cal_entropy(data, len);

	//printf("in function:preEntropy: %lf
",preEntropy);

    int i,j;//   

	/*for(i=0;i<len;i++)

	{

		//printf("data: %d
",data[i].i);

	}*/

    double entropy=0.0;

    for(i=0;i<2;i++)

    {

        tuple subData[100];

        int sublen=0;

        int cnt=0;

        for(j=0;j<len;j++)///         ，    ，          

        {

            if(data[j].g==i)

            {

                cnt++;

				//printf("data: %d
",data[j].i);

                copy_tuple(&data[j],&subData[sublen++]);//   j   i，  

            }

        }

		/*printf("data:");

		for(j=0;j<sublen;j++)

		{

			printf("%d ",subData[j].i);

		}printf("
");*/

		//printf("entropy: %lf
",cal_entropy(subData, sublen));

        entropy=entropy+cnt*1.0/len*cal_entropy(subData, sublen);

    }

    double splitInfo=cal_splitInfo(data,len);

	//printf("in function:entropy: %lf
",entropy);

	//printf("in function:splitInfo: %lf
",splitInfo);

    return (preEntropy-entropy)/splitInfo;

}

double choose_best_gap(tuple *data,int len)

{

    int i;

    qsort(data,len,sizeof(tuple),cmp);

    double maxGap=-1;

    double maxRaito=-1;

    for(i=1;i<len;i++)

    {

        double gap=(data[i-1].h+data[i].h)/2;// int  double

        double ratio=cal_grainRatio2(data,len,gap);//     gap         ratio

		//printf("ratio: %lf
",ratio);

		//printf("gap: %lf
",gap);

        if(maxRaito<ratio)///        

        {

            maxRaito=ratio;

			maxGap=gap;//         gap

        }

    }

    return maxGap;

}

double cal_grainRatio2(tuple *data,int len,double gap)//          

{

    /*

     1.        

     2.                  

     3.             

     4.        

     5.    

     6.       

     */

    int i;//   

    double preEntropy=cal_entropy(data, len);

    tuple smaller[100],larger[100];

    int leftlen=0,rightlen=0;

    for(i=0;i<len;i++)

    {

        if(data[i].h<=gap)

        {

            copy_tuple(&data[i], &smaller[leftlen++]);

        }

        else

        {

            copy_tuple(&data[i], &larger[rightlen++]);

        }

    }

    double leftEntropy=cal_entropy(smaller, leftlen);

    double rightEntropy=cal_entropy(larger, rightlen);

    double tol=0;

    tol=tol+leftlen*1.0/len*leftEntropy+rightlen*1.0/len*rightEntropy;

    tol=preEntropy-tol;

    double splitInfo=cal_splitInfo(data, len);

    return tol/splitInfo;

}



double cal_entropy(tuple *data,int len)

{

    /*

     1.         

     */

    int i,j;

    double result=0.0;

    for(i=0;i<3;i++)

    {

        int cnt=0;

        for(j=0;j<len;j++)

        {

            if(data[j].c==i)

            {

                cnt++;

            }

        }

		if(cnt==0)continue;//            

		//printf("cnt: %d
",cnt);

        double temp=log(cnt*1.0/len)/log(2);

		//printf("temp: %lf
",temp);

        result=result-cnt*1.0/len*temp;

		//printf("result: %lf
",result);

    }

    return result;

}

double cal_splitInfo(tuple *data,int len)//        

{

    double result=0.0;

    int i,j;//   

    for(i=0;i<2;i++)

    {

        int cnt=0;

        for(j=0;j<len;j++)

        {

            if(data[j].g==i)

            {

                cnt++;

            }

        }

        double temp=log(cnt*1.0/len)/log(2);

        result=result-cnt*1.0/len*temp;

    }

    return result;

}

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

【Codility Lesson3】FrogJmp

A small frog wants to get to the other side of the road. The frog is currently located at position X and wants to get to...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

C 4.5 알고리즘 분류 기

좋은 웹페이지 즐겨찾기