DNA 서열 분류: Fisher 판별 법

#include <stdio.h>
#include <string.h>
#include <math.h>

#define DNA_A 'a'
#define DNA_T 't'
#define DNA_C 'c'
#define DNA_G 'g'

#define DNA_AN 0
#define DNA_TN 1
#define DNA_CN 2
#define DNA_GN 3

int main()
{
	int Clover[64];
	char AminoAcids[21][25];
	int i, j, count, ich, length[20], len, k, p, q;
	char dnaArray[20][200];	//DNA  
	int num[60][22], ClovNum[60];
	float percent[60][22];
	FILE *file;
	float distance[60], disMin[3], dis;
	int sel[20];
	double JD[22], temp;			//    
	int dSite[22], tempInt, featureSel[8];
	float percentY[20], evgM[2][8], tildeM[2], evgMinus[8];
	float perMinusA[10][8], perMinusB[10][8], stanA[8][8], stanB[8][8], stanZZ[8][8], stanInv[8][8];
	float disperS[2][8], disperSW[8];
	float omega[8], thresY;
	float testYY[20];
	int testInt[20];

	strcpy(AminoAcids[0], "aaaaag");	
	strcpy(AminoAcids[1], "aataacgaagacgatgag");
	strcpy(AminoAcids[2], "taatagtat");
	strcpy(AminoAcids[3], "tac");
	strcpy(AminoAcids[4], "caacagcatcac");
	strcpy(AminoAcids[5], "agaagcagtaggtcgtca");
	strcpy(AminoAcids[6], "ggaggcggtggg");
	strcpy(AminoAcids[7], "tgatgctgttgg");
	strcpy(AminoAcids[8], "cgacgccgtcgg");
	strcpy(AminoAcids[9], "ataatg");
	strcpy(AminoAcids[10], "gtagtg");
	strcpy(AminoAcids[11], "gttgtc");
	strcpy(AminoAcids[12], "ttattg");
	strcpy(AminoAcids[13], "tttttc");
	strcpy(AminoAcids[14], "ctactg");
	strcpy(AminoAcids[15], "cttctc");
	strcpy(AminoAcids[16], "acaacg");
	strcpy(AminoAcids[17], "acc");
	strcpy(AminoAcids[18], "gcagccgctgcgtcttcc");
	strcpy(AminoAcids[19], "ccaccgcctccc");
	strcpy(AminoAcids[20], "attatcact");

	strcpy(dnaArray[0], "aggcacggaaaaacgggaataacggaggaggacttggcacggcattacacggaggacgaggtaaaggaggcttgtctacggccggaagtgaagggggatatgaccgcttgg");
	strcpy(dnaArray[1], "cggaggacaaacgggatggcggtattggaggtggcggactgttcggggaattattcggtttaaacgggacaaggaaggcggctggaacaaccggacggtggcagcaaagga");
	strcpy(dnaArray[2], "gggacggatacggattctggccacggacggaaaggaggacacggcggacatacacggcggcaacggacggaacggaggaaggagggcggcaatcggtacggaggcggcgga");
	strcpy(dnaArray[3], "atggataacggaaacaaaccagacaaacttcggtagaaatacagaagcttagatgcatatgttttttaaataaaatttgtattattatggtatcataaaaaaaggttgcga");
	strcpy(dnaArray[4], "cggctggcggacaacggactggcggattccaaaaacggaggaggcggacggaggctacaccaccgtttcggcggaaaggcggagggctggcaggaggctcattacggggag");
	strcpy(dnaArray[5], "atggaaaattttcggaaaggcggcaggcaggaggcaaaggcggaaaggaaggaaacggcggatatttcggaagtggatattaggagggcggaataaaggaacggcggcaca");
	strcpy(dnaArray[6], "atgggattattgaatggcggaggaagatccggaataaaatatggcggaaagaacttgttttcggaaatggaaaaaggactaggaatcggcggcaggaaggatatggaggcg");
	strcpy(dnaArray[7], "atggccgatcggcttaggctggaaggaacaaataggcggaattaaggaaggcgttctcgcttttcgacaaggaggcggaccataggaggcggattaggaacggttatgagg");
	strcpy(dnaArray[8], "atggcggaaaaaggaaatgtttggcatcggcgggctccggcaactggaggttcggccatggaggcgaaaatcgtgggcggcggcagcgctggccggagtttgaggagcgcg");
	strcpy(dnaArray[9], "tggccgcggaggggcccgtcgggcgcggatttctacaagggcttcctgttaaggaggtggcatccaggcgtcgcacgctcggcgcggcaggaggcacgcgggaaaaaacg");
	strcpy(dnaArray[10], "gttagatttaacgttttttatggaatttatggaattataaatttaaaaatttatattttttaggtaagtaatccaacgtttttattactttttaaaattaaatatttatt");
	strcpy(dnaArray[11], "gtttaattactttatcatttaatttaggttttaattttaaatttaatttaggtaagatgaatttggttttttttaaggtagttatttaattatcgttaaggaaagttaaa");
	strcpy(dnaArray[12], "gtattacaggcagaccttatttaggttattattattatttggattttttttttttttttttttaagttaaccgaattattttctttaaagacgttacttaatgtcaatgc");
	strcpy(dnaArray[13], "gttagtcttttttagattaaattattagattatgcagtttttttacataagaaaatttttttttcggagttcatattctaatctgtctttattaaatcttagagatatta");
	strcpy(dnaArray[14], "gtattatatttttttatttttattattttagaatataatttgaggtatgtgtttaaaaaaaatttttttttttttttttttttttttttttttaaaatttataaatttaa");
	strcpy(dnaArray[15], "gttatttttaaatttaattttaattttaaaatacaaaatttttactttctaaaattggtctctggatcgataatgtaaacttattgaatctatagaattacattattgat");
	strcpy(dnaArray[16], "gtatgtctatttcacggaagaatgcaccactatatgatttgaaattatctatggctaaaaaccctcagtaaaatcaatccctaaacccttaaaaaacggcggcctatccc");
	strcpy(dnaArray[17], "gttaattatttattccttacgggcaattaattatttattacggttttatttacaattttttttttttgtcctatagagaaattacttacaaaacgttattttacatactt");
	strcpy(dnaArray[18], "gttacattatttattattatccgttatcgataattttttacctcttttttcgctgagtttttattcttactttttttcttctttatataggatctcatttaatatcttaa");
	strcpy(dnaArray[19], "gtatttaactctctttactttttttttcactctctacattttcatcttctaaaactgtttgatttaaacttttgtttctttaaggattttttttacttatcctctgttat");

	//         ,          
	for(i=0; i<64; i++)
	{
		Clover[i] = -1;
	}
	for(i=0; i<21; i++)
	{
		len = strlen(AminoAcids[i]);
		for(j=0; j<len; j+=3)
		{
			count = 0;
			ich = AminoAcids[i][j];
			switch(ich)
			{
			case 97:
				count += DNA_AN;
				break;
			case 116:
				count += DNA_TN;
				break;
			case 99:
				count += DNA_CN;
				break;
			case 103:
				count += DNA_GN;
				break;
			}
			ich = AminoAcids[i][j+1];
			switch(ich)
			{
			case 97:
				count += DNA_AN * 4;
				break;
			case 116:
				count += DNA_TN * 4;
				break;
			case 99:
				count += DNA_CN * 4;
				break;
			case 103:
				count += DNA_GN * 4;
				break;
			}
			ich = AminoAcids[i][j+2];
			switch(ich)
			{
			case 97:
				count += DNA_AN * 16;
				break;
			case 116:
				count += DNA_TN * 16;
				break;
			case 99:
				count += DNA_CN * 16;
				break;
			case 103:
				count += DNA_GN * 16;
				break;
			}
			Clover[count] = i;
		}
	}

	for(i=0; i<64; i++)
	{
		//printf("%d
", Clover[i]); } // 20*3 for(i=0; i<60; i++) { ClovNum[i] = 0; for(j=0; j<22; j++) { num[i][j] = 0; percent[i][j] = 0; } } for(i=0; i<20; i++) { length[i] = strlen(dnaArray[i]); for(k=0; k<3; k++) { for(j=k; j<length[i]; j+=3) { count = 0; ich = dnaArray[i][j-2]; switch(ich) { case 97: count += DNA_AN; break; case 116: count += DNA_TN; break; case 99: count += DNA_CN; break; case 103: count += DNA_GN; break; } ich = dnaArray[i][j-1]; switch(ich) { case 97: count += DNA_AN * 4; break; case 116: count += DNA_TN * 4; break; case 99: count += DNA_CN * 4; break; case 103: count += DNA_GN * 4; break; } ich = dnaArray[i][j]; switch(ich) { case 97: count += DNA_AN * 16; break; case 116: count += DNA_TN * 16; break; case 99: count += DNA_CN * 16; break; case 103: count += DNA_GN * 16; break; } num[3*i+k][Clover[count]] ++; ClovNum[3*i+k]++; if(dnaArray[i][j-2] == 'a' || dnaArray[i][j-2] == 't') { num[3*i+k][21] ++; } if(dnaArray[i][j-1] == 'a' || dnaArray[i][j-1] == 't') { num[3*i+k][21] ++; } if(dnaArray[i][j] == 'a' || dnaArray[i][j] == 't') { num[3*i+k][21] ++; } } } } // for(i=0; i<60; i++) { for(j=0; j<21; j++) { percent[i][j] = num[i][j] * 1.0 / ClovNum[i]; } percent[i][21] = num[i][21] * 1.0 / length[i/3]; } file = fopen("FisherNum.txt", "w"); for(i=0; i<60; i++) { fprintf(file, "%d ", ClovNum[i]); for(j=0; j<22; j++) { fprintf(file, "%d ", num[i][j]); } fprintf(file, "
"); } fclose(file); file = fopen("FisherPercent.txt", "w"); for(i=0; i<60; i++) { for(j=0; j<22; j++) { fprintf(file, "%f ", percent[i][j]); } fprintf(file, "
"); } fclose(file); // A、B for(i=0; i<20; i++) { sel[i] = -1; } for(i=0; i<10; i++) { for(k=0; k<3; k++) { disMin[k] = 60000; for(j=0; j<30; j++) { if(i == j/3) continue; dis = 0; for(p=0; p<22; p++) { dis += (percent[3*i+k][p]-percent[j][p]) * (percent[3*i+k][p]-percent[j][p]); } if(disMin[k] > dis) disMin[k] = dis; } } dis = 60000; for(k=0; k<3; k++) { if(disMin[k] < dis) { dis = disMin[k]; sel[i] = 3*i+k; } } } for(i=10; i<20; i++) { for(k=0; k<3; k++) { disMin[k] = 60000; for(j=30; j<60; j++) { if(i == j/3) continue; dis = 0; for(p=0; p<22; p++) { dis += (percent[3*i+k][p]-percent[j][p]) * (percent[3*i+k][p]-percent[j][p]); } if(disMin[k] > dis) disMin[k] = dis; } } dis = 60000; for(k=0; k<3; k++) { if(disMin[k] < dis) { dis = disMin[k]; sel[i] = 3*i+k; } } } for(i=0; i<20; i++) { printf("%d ", sel[i]); } printf("
"); // for(i=0; i<22; i++) { JD[i] = 0; } for(i=0; i<10; i++) { for(j=10; j<20; j++) { for(p=0; p<22; p++) { JD[p] += sqrt((percent[sel[i]][p]-percent[sel[j]][p]) * (percent[sel[i]][p]-percent[sel[j]][p])); } } } for(i=0; i<22; i++) { printf("%f ", JD[i]); dSite[i] = i; } printf("
"); // JD , 8 for(i=0; i<21; i++) for(j=21; j>=i+1; j--) if(JD[j] > JD[j-1]) { temp = JD[j]; JD[j] = JD[j-1]; JD[j-1] = temp; tempInt = dSite[j]; dSite[j] = dSite[j-1]; dSite[j-1] = tempInt; }; // 8 for(i=0; i<8; i++) { featureSel[i] = dSite[i]; printf("%d ", dSite[i]); } printf("
"); // 8 txt file = fopen("feature.txt", "w"); for(i=0; i<10; i++) { fprintf(file, "%2d %2d ", i, 1); for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "
"); } fprintf(file, "
"); for(i=10; i<20; i++) { fprintf(file, "%2d %2d ", i, -1); for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "
"); } fclose(file); //Fisher // for(i=0; i<2; i++) { for(j=0; j<8; j++) evgM[i][j] = 0; } for(i=0; i<10; i++) { for(j=0; j<8; j++) { evgM[0][j] += percent[dSite[i]][featureSel[j]]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { evgM[1][j] += percent[dSite[i]][featureSel[j]]; } } for(i=0; i<2; i++) { for(j=0; j<8; j++) { evgM[i][j] /= 10; printf("%f %f
", evgM[0][j], evgM[1][j]); } } // for(i=0; i<10; i++) { for(j=0; j<8; j++) { perMinusA[i][j] = percent[dSite[i]][featureSel[j]] - evgM[0][j]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { perMinusB[i-10][j] = percent[dSite[i]][featureSel[j]] - evgM[1][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanA[i][j] = 0; for(p=0; p<10; p++) stanA[i][j] += perMinusA[p][i] * perMinusA[p][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanB[i][j] = 0; for(p=0; p<10; p++) stanB[i][j] += perMinusB[p][i] * perMinusB[p][j]; } } for(i=0; i<8; i++) { for(j=0; j<8; j++) { stanZZ[i][j] = stanA[i][j] + stanB[i][j]; } } file = fopen("stanZZ.txt", "w"); for(i=0; i<8; i++) { for(j=0; j<8; j++) { fprintf(file, "%f ", stanZZ[i][j]); } fprintf(file, "
"); } fclose(file); // stanZZ file = fopen("stanInv.txt", "r"); if(file == NULL) printf("Wrong!
"); for(i=0; i<8; i++) { for(j=0; j<8; j++) { fscanf(file, "%f", &stanInv[i][j]); printf("%f ", stanInv[i][j]); } printf("
"); } fclose(file); // w for(i=0; i<8; i++) { evgMinus[i] = evgM[0][i] - evgM[1][i]; } printf(" :
"); for(i=0; i<8; i++) { omega[i] = 0; for(j=0; j<8; j++) { omega[i] += stanInv[i][j] * evgMinus[j]; } printf("%f
", omega[i]); } // y //d 1 for(j=0; j<20; j++) percentY[j] = 0; for(i=0; i<10; i++) { for(j=0; j<8; j++) { percentY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } } for(i=10; i<20; i++) { for(j=0; j<8; j++) { percentY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } } // tildeM[0] = tildeM[1] = 0; for(i=0; i<10; i++) { tildeM[0] += percentY[i]; } for(i=10; i<20; i++) { tildeM[1] += percentY[i]; } tildeM[0] /= 10; tildeM[1] /= 10; thresY = (tildeM[0] + tildeM[1])/2; printf(" Y = %f
", thresY); file = fopen("InfoFun.txt", "w"); for(i=0; i<8; i++) { fprintf(file, "%f ", omega[i]); } fprintf(file, "
%f", thresY); fclose(file); for(i=0; i<20; i++) { testYY[i] = 0; for(j=0; j<8; j++) { testYY[i] += percent[dSite[i]][featureSel[j]] * omega[j]; } if(testYY[i] > thresY) printf("1
"); else printf("0
"); } file = fopen("Eight.txt", "w"); for(i=0; i<20; i++) { for(j=0; j<8; j++) { fprintf(file, "%f ", percent[dSite[i]][featureSel[j]]); } fprintf(file, "
"); } fclose(file); return 0; }

좋은 웹페이지 즐겨찾기