데이터 마 이 닝 의 FP - tree 알고리즘

알고리즘 세부 사항 은 논문 참조: Mining Frequent Patterns Without Candidata Generation
이 논문 은 Jiawei Han 의 대작 으로 또 한 명의 중국인 이 북 미 에 갔다.
그래 픽 인터페이스 프로젝트 + 테스트 용례 스탬프 여기http://download.csdn.net/detail/michealtx/4266155
콘 솔 버 전 C + + 코드 는 다음 과 같 습 니 다.
#include <iostream>
#include <sstream>
#include <fstream>
#include <vector>
#include <map>
#include <list>
#include <string>
#include <algorithm>
#include <ctime>
using namespace std;

struct FPNode
{
	string nodeName;
	unsigned int count;
	
	FPNode *parentNode;
	vector<FPNode *> childNode;
	FPNode *siblingNode;
	
	bool operator < (const FPNode &other) const
	{
		return count>other.count;
	}
};

//              database ,fileName   char* ,   string   ,in()  
bool ObtainDatabase(vector<vector<string> > &database,char *fileName)
{
 /*   vector<string> data;
	data.push_back(1);data.push_back(2);data.push_back(5);
	database.push_back(data);
	
	data.clear();
	data.push_back(2);data.push_back(4);
	database.push_back(data);
	
	data.clear();
	data.push_back(2);data.push_back(3);
	database.push_back(data);
	
	data.clear();
	data.push_back(1);data.push_back(2);data.push_back(4);
	database.push_back(data);
	
	data.clear();
	data.push_back(1);data.push_back(3);
	database.push_back(data);
	
	data.clear();
	data.push_back(2);data.push_back(3);
	database.push_back(data);
	
	data.clear();
	data.push_back(1);data.push_back(3);
	database.push_back(data);
	
	data.clear();
	data.push_back(1);data.push_back(2);data.push_back(3);data.push_back(5);
	database.push_back(data);
	
	data.clear();
	data.push_back(1);data.push_back(2);data.push_back(3);
	database.push_back(data);
	return true;
*/
	ifstream in(fileName);
	if(!in)
	{
		cout<<"      !"<<endl;
		return false;
	}
	
	string s="";
	int i=0;
	while(getline(in,s))
	{//      
	i++;
		vector<string> transaction;	
		int len=s.length();
		string str="";
		for(int i=0;i<len;i++)
		{//          
			if(s[i]!=' ')
			{
				str+=s[i];
			}
			else if(s[i]==' '||i==len-1)
			{		
				transaction.push_back(str);
				str="";
			}
		}
		database.push_back(transaction);
		s="";
	}cout<<i;system("pause");
	return true;
	
}
///////////////////////////////////////////

//       ,  1-    
void CreateItemset(vector<vector<string> > &database,vector<FPNode> &largeItemset,unsigned int minSupport)
{
	map<string,int> dir;
	map<string,int>::iterator dirIt;
	
	vector<vector<string> >::iterator databaseIt;
	
	vector<string> temp;
	vector<string>::iterator tempIt;
	
	//         ,     <item,count>
	for(databaseIt=database.begin();databaseIt!=database.end();databaseIt++)
	{
		temp=*databaseIt;
		for(tempIt=temp.begin();tempIt!=temp.end();tempIt++)
		{
			string item=*tempIt;
			dirIt=dir.find(item);
			if(dirIt==dir.end())
			{//item    dir 
				dir.insert(pair<string,int>(item,1));
			}
			else
			{//item   dir ,   count  1
				(dirIt->second)++;
			}
		}
	}
	
	//           minSopport item
	for(dirIt=dir.begin();dirIt!=dir.end();dirIt++)
	{
		if(dirIt->second>=minSupport)
		{
			FPNode large;
			large.nodeName=dirIt->first;
			large.count=dirIt->second;
			large.parentNode=NULL;
			(large.childNode).clear();
			large.siblingNode=NULL;
			largeItemset.push_back(large);
		}
	}
	//       count       
	sort(largeItemset.begin(),largeItemset.end());
}

//       
void OutputLargeItemset(vector<FPNode> &largeItemset,unsigned int i)
{
	cout<<"   "<<largeItemset.size()<<"    "<<i<<"-    :"<<endl;
	
	vector<FPNode>::iterator largeItemsetIt;
	int j=0;
	for(largeItemsetIt=largeItemset.begin();largeItemsetIt!=largeItemset.end();largeItemsetIt++)
	{
		FPNode temp=*largeItemsetIt;
		cout<<"{ ";
		cout<<temp.nodeName<<" : "<<temp.count;
		cout<<" }";
		j++;
		if(j%4==0)
		{
			cout<<endl;
		}
	}
	cout<<endl<<endl;
}


//////////////////////////////////////////////////////////
//        trans    1-    large1     ,   1-    large1       ,  tempTrans 
void SortItem(vector<string> &trans,vector<string> &tempTrans,vector<FPNode> &large1)
{
	
	unsigned int sizeLarge=large1.size();
	unsigned int sizeTrans=trans.size();
	
	for(int i=0;i<sizeLarge;i++)
	{
		FPNode tempNode=large1[i];
		for(int j=0;j<sizeTrans;j++)
		{
			if(tempNode.nodeName==trans[j])
			{
				tempTrans.push_back(tempNode.nodeName);
			}
		}
	}
}


//    transTemp[k]         root      
int AddNode(FPNode *parentNode,vector<string> &transTemp,unsigned int k,vector<FPNode> &large1)
{
	unsigned int size=transTemp.size();
	if(size>0&&(k>=0&&k<size))
	{//cout<<"addNode      :"<<parentNode->nodeName<<endl;
		FPNode *nodeTemp=new FPNode;//     
		if(nodeTemp==NULL)
		{
			return 1;
		}
		nodeTemp->nodeName=transTemp[k];
		nodeTemp->count=1;
		nodeTemp->parentNode=parentNode;
		nodeTemp->siblingNode=NULL;
		
		(parentNode->childNode).push_back(nodeTemp);
		
		// nodeTemp          。
		unsigned int sizeLarge=large1.size();
		for(int i=0;i<sizeLarge;i++)
		{
			if((large1[i]).nodeName==nodeTemp->nodeName)
			{
				FPNode *temp=&(large1[i]);
				while(temp->siblingNode!=NULL)
				{
					temp=temp->siblingNode;
				}
				temp->siblingNode=nodeTemp;//cout<<"brother";system("pause");
				break;
			}
		}
		
		k++;
		if(k<size)
		{//         
			AddNode(nodeTemp,transTemp,k,large1);
		}
	}
	return 0;
}

//  FP-Tree
int CreateFPTree(vector<vector<string> > &database,vector<FPNode> &large1,FPNode **treeRoot)
{
	*treeRoot=new FPNode;
	FPNode *root=*treeRoot;
	if(root==NULL)
	{
		return 1;
	}
	root->nodeName="-1";//-1     
	root->count=0;
	root->parentNode=NULL;
	(root->childNode).clear();
	root->siblingNode=NULL;
	
	unsigned int sizeDatabase=database.size();
	for(int i=0;i<sizeDatabase;i++)
	{
		vector<string> trans=database[i];//         
		vector<string> transTemp;
		SortItem(trans,transTemp,large1);//       large1          ,  transTemp 
/*
//    trans transTemp
int da=trans.size();
cout<<"trans:";
for(int i=0;i<da;i++)cout<<trans[i]<<" ";
cout<<endl<<"transTemp:";
da=transTemp.size();
for(int i=0;i<da;i++)cout<<transTemp[i]<<" ";
cout<<endl;
//system("pause");
*/
		vector<FPNode *> childNode=root->childNode;//          
//cout<<"      "<<childNode.size()<<"    "<<endl;
		if(childNode.size()==0)
		{//             ,       。          root   。
			AddNode(root,transTemp,0,large1);//cout<<(root->childNode).size()<<"    "<<endl;
		}
		else
		{//    
			unsigned int sizeTrans=transTemp.size();
			FPNode *pt=NULL;
			for(int i=0;i<sizeTrans;i++)
			{//   transTemp[i]        
				string temp=transTemp[i];
				
				unsigned int sizeChild=childNode.size();
				
				int j=0;
				for(j=0;j<sizeChild;j++)
				{
					if(temp==(childNode[j])->nodeName)
					{
						pt=childNode[j];
						(pt->count)++;//transTemp[i]         ,        1
		//cout<<"  transTemp[i] "<<pt->nodeName<<endl;
						childNode=pt->childNode;
						break;
					}
				}
				
				if(j==sizeChild)
				{
					if(pt==NULL)
					{//transTemp[i]    ,     i         ,      
						AddNode(root,transTemp,i,large1);
					}
					else
					{//transTemp[i]       ,         ,              
						AddNode(pt,transTemp,i,large1);//      i+1,       !
					}
					break;
				}
			}
		}
	}

	return 0;
}
/////////////////////////////////////////////////////////////////
//  large1          conditional pattern base     。                 。
void CreateConditionalPatterBase(vector<FPNode> &large1,map<string,vector<vector<string> > > &cpbMap)
{//if(large1.empty()){cout<<"large1 is empty"<<endl;}else{cout<<"large1 is not empty"<<endl;}
	vector<string> condition;
	int sizeLarge=large1.size();
	//large1       (                  ,      ,      ),        
	for(int i=sizeLarge-1;i>0;i--)
	{
		FPNode *sibling=(large1[i]).siblingNode;
		//if(sibling==NULL)cout<<"*****"<<i<<endl;system("pause");
		vector<vector<string> > pathSet;
		pathSet.clear();
		while(sibling!=NULL)
		{
			unsigned int count=sibling->count;
			FPNode *parent=sibling->parentNode;
			vector<string> path;
			path.clear();
			while(parent!=NULL)
			{//    ,               path 
				if(parent->parentNode!=NULL)
				{//parent      ,  parent   ,      
					path.push_back(parent->nodeName);
					parent=parent->parentNode;
				}
				else
				{
					break;
				}
			}
			
			if(!(path.empty()))
			{
				for(int j=0;j<count;j++)
				{
					pathSet.push_back(path);	
				}
			}
			
			sibling=sibling->siblingNode;
		}
		if(pathSet.size()!=0)
		{
			cpbMap.insert(pair<string,vector<vector<string> > >((large1[i]).nodeName,pathSet));
		}
	}
}
//////////////////////////////////////////////////////////
void DestroyTree(FPNode *root)
{
	if(root!=NULL)
	{
		vector<FPNode *> childNode=root->childNode;
		unsigned int size=childNode.size();
		for(int i=0;i<size;i++)
		{
			DestroyTree(childNode[i]);
		}
		delete root;
	}
	root=NULL;
}


//  FP 
void OutputFPTree(FPNode *root)
{
	if(root==NULL)
	{//cout<<"root is null"<<endl;
		return;
	}
	//cout<<"root not null"<<endl;
	vector<FPNode*> childNode=root->childNode;
	
	
	
	cout<<"{"<<root->nodeName<<","<<root->count<<","<<childNode.size();
	if(childNode.size()!=0)
	{
		cout<<"}    :"<<endl;
		for(int i=0;i<childNode.size();i++)
		{
			OutputFPTree(childNode[i]);
		}
	}
	else 
	{
		cout<<"}  "<<endl;
	}
	
}


//                   FP ,        FP   
void CreateCpbFPtree(map<string,vector<vector<string> > > &cpbMap,map<string,FPNode *> &cpbFPTreeMap,unsigned int minSupport)
{
	map<string,vector<vector<string> > >::iterator cpbMapIt;
	for(cpbMapIt=cpbMap.begin();cpbMapIt!=cpbMap.end();cpbMapIt++)
	{
		string nodeName=cpbMapIt->first;
		vector<vector<string> > pathSet=cpbMapIt->second;
		
		vector<FPNode> cpbLarge;
		//  nodeName           pathSet         cpbLarge
		CreateItemset(pathSet,cpbLarge,minSupport);
		//            
		//cout<<"

"<<nodeName<<" "<<endl; //OutputLargeItemset(cpbLarge,1); //if(pathSet.empty()){cout<<"pathSet is null"<<endl;}else{cout<<"pathSet not null"<<endl;} //if(cpbLarge.empty()){cout<<"cpbLarge is null"<<endl;}else{cout<<"cpbLarge not null"<<endl;system("pause");} FPNode *cpbRoot=NULL; if(!(pathSet.empty())&&!(cpbLarge.empty())) { CreateFPTree(pathSet,cpbLarge,&cpbRoot);// nodeName FP //cout<<"

"<<nodeName<<" FP "<<endl; //OutputFPTree(cpbRoot); } if(cpbRoot!=NULL) {// vector<FPNode *> *childNode=&(cpbRoot->childNode); vector<FPNode *>::iterator childIt=(*childNode).begin(); unsigned int size=(*childNode).size(); for(int i=0;i<size;i++) { if((*childIt)->count< minSupport) {// //cout<<endl<<(*childIt)->nodeName<<" "<<(*childIt)->count<<" EEEEEEEEEEEERROR"<<endl;system("pause"); DestroyTree(*childIt); childIt=(*childNode).erase(childIt);//cout<<endl<<(*childIt)->nodeName<<" "<<(*childIt)->count<<" EEEEEEEEEEEERROR"<<endl;system("pause"); } else { childIt++; } } if((*childNode).size()!=0) {// 0, FP cpbFPTreeMap.insert(pair<string,FPNode*>(nodeName,cpbRoot)); } } } } ////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////// // FP , pathSet void CreatePath(FPNode *root,vector<list<FPNode> > &pathSet) { //vector<list<FPNode> > pathSet=new vector<list<FPNode> >(); if(root!=NULL) { if((root->childNode).empty()) {//root list<FPNode> path; if(root->parentNode!=NULL) {// path.push_back(*root);//cout<<"root!=NULL"<<"--"<<root->nodeName<<" "<<root->count<<" "<<(root->childNode).size()<<endl; } if(path.size()!=0) { pathSet.push_back(path); } } else {//root vector<FPNode *> childNode=root->childNode; unsigned int size=childNode.size(); if(size>1) {//root for(int i=0;i<size;i++) { unsigned int count=childNode[i]->count; CreatePath(childNode[i],pathSet); unsigned int cnt=(childNode[i]->childNode).size(); if(cnt==0) { cnt=1; } if(root->parentNode!=NULL) { unsigned int sizePathSet=pathSet.size(); for(int j=sizePathSet-cnt;j<sizePathSet;j++) {//cout<<"size>1"<<"--"<<root->nodeName<<" "<<root->count<<" "<<(root->childNode).size()<<endl; list<FPNode> *lis=&(pathSet[j]); FPNode *node=new FPNode; node->nodeName=root->nodeName; node->count=count; node->parentNode=NULL; lis->push_front(*node); } } } } else {//root for(int i=0;i<size;i++) { unsigned int count=childNode[i]->count; CreatePath(childNode[i],pathSet); if(root->parentNode!=NULL) { unsigned int sizePathSet=pathSet.size(); for(int j=sizePathSet-1;j<sizePathSet;j++) {//cout<<"size==1"<<"--"<<root->nodeName<<" "<<root->count<<" "<<(root->childNode).size()<<endl; list<FPNode> *lis=&(pathSet[j]); lis->push_front(*root); } } } } } } //return pathSet; } // void OutputPathSet(vector<list<FPNode> > &pathSet) { vector<list<FPNode> >::iterator pathSetIt; for(pathSetIt=pathSet.begin();pathSetIt!=pathSet.end();pathSetIt++) { list<FPNode> path=*pathSetIt; list<FPNode>::iterator pathIt; for(pathIt=path.begin();pathIt!=path.end();pathIt++) { cout<<(*pathIt).nodeName<<" "; } cout<<endl<<endl; } } // path fp,k path.size()-1 map<list<string>,int>* CreateFP(list<FPNode> &path)//list<FPNode>::reverse_iterator pathIt,int k,vector<vector<string> > &fp) { if(path.size()>0) { // path FPNode *temp=&(path.front()); if(!temp) { return NULL; } list<string> lis; string start=temp->nodeName; lis.push_back(start); map<list<string>,int> *fp=new map<list<string>,int>; if(!fp) { return NULL; } fp->insert(pair<list<string>,int>(lis,temp->count)); path.pop_front(); map<list<string>,int> *others=CreateFP(path); if(others) { map<list<string>,int>::iterator othersIt=others->begin(); for(;othersIt!=others->end();othersIt++) { fp->insert(pair<list<string>,int>(othersIt->first,othersIt->second)); const list<string> *li=&(othersIt->first); list<string> tmp; list<string>::const_iterator liIt=li->begin(); while(liIt!=li->end()) { tmp.push_back(*liIt); liIt++; } tmp.push_front(start); fp->insert(pair<list<string>,int>(tmp,othersIt->second)); } } return fp; } return NULL; } // FP , void ObtainFrequenPattern(map<string,FPNode *> &cpbFPTreeMap,map<string,map<list<string>,int>* > &frequentPatternMap) {//if(cpbFPTreeMap.empty()){cout<<"cpbFPTreeMap is empty"<<endl;} map<string,FPNode *>::iterator cpbTreeIt; for(cpbTreeIt=cpbFPTreeMap.begin();cpbTreeIt!=cpbFPTreeMap.end();cpbTreeIt++) { string nodeName=cpbTreeIt->first; FPNode *cpbRoot=cpbTreeIt->second; // nodeName pathSet vector<list<FPNode> > pathSet; list<FPNode> path; // FP // cout<<endl<<nodeName<<" FP :"<<endl; // OutputFPTree(cpbRoot); CreatePath(cpbRoot,pathSet); // // cout<<endl<<nodeName<<" :"<<endl; // OutputPathSet(pathSet);system("pause"); vector<list<FPNode> >::iterator pathSetIt; for(pathSetIt=pathSet.begin();pathSetIt!=pathSet.end();pathSetIt++) { list<FPNode> pathTemp=*pathSetIt; // , fp map<list<string>,int>* fp=NULL; fp=CreateFP(pathTemp); // cout<<"CreateFP finish"<<endl; if(fp==NULL) { // cout<<"fp is null"<<endl; continue; } map<string,map<list<string>,int>* >::iterator fpmIt; fpmIt=frequentPatternMap.find(nodeName); if(fpmIt==frequentPatternMap.end()) {// nodeName , fp frequentPatternMap.insert(pair<string,map<list<string>,int>* >(nodeName,fp)); } else { map<list<string>,int>::iterator fpIt=fp->begin(); unsigned int sizeFp=fp->size(); map<list<string>,int>* pt=fpmIt->second; map<list<string>,int>::iterator ptIt=pt->begin(); unsigned int sizePt=pt->size(); for(;fpIt!=fp->end();++fpIt) {// fp list pt list bool flag=true; int j=0; for(ptIt=pt->begin(),j=0;j<sizePt&&ptIt!=pt->end();++j,++ptIt) { list<string>::const_iterator t1It=(fpIt->first).begin(); unsigned int sizeT1=(fpIt->first).size(); list<string>::const_iterator t2It=(ptIt->first).begin(); unsigned int sizeT2=(ptIt->first).size(); if(sizeT1!=sizeT2) { flag=false; continue; } for(int k=0;k<sizeT1;k++) { if(*t1It!=*t2It) { flag=false; break; } t1It++; t2It++; if(k==sizeT1-1) { flag=true; } } if(flag==true) { break; } } if(flag==true) { ptIt->second+=fpIt->second; } else { pt->insert(pair<list<string>,int>(fpIt->first,fpIt->second)); } } } } } } // void OutputFrequentPattern(map<string,map<list<string>,int>* > &frequentPatternMap,unsigned int minSupport) {//if(frequentPatternMap.empty()){cout<<"frequentPatternMap is empty"<<endl;} map<string,map<list<string>,int>* >::iterator it; int geshu=0; for(it=frequentPatternMap.begin();it!=frequentPatternMap.end();it++) {geshu++; string nodeName=it->first; map<list<string>,int>* fpSet=it->second; unsigned int size=fpSet->size(); map<list<string>,int>::iterator fpSetIt=fpSet->begin(); cout<<nodeName<<":"<<endl; for(int i=0;i<size;++i,++fpSetIt) { const list<string> *fp=&(fpSetIt->first); unsigned int cnt=fpSetIt->second; if(cnt>=minSupport) { unsigned int len=fp->size(); list<string>::const_iterator fpIt=fp->begin(); cout<<"{ "; for(int j=0;j<len;j++,++fpIt) { cout<<*fpIt<<" "; } cout<<nodeName<<" }"<<endl; } } }cout<<geshu<<"---"; } // void OutputConditionalPatterBase(map<string,vector<vector<string> > > &cpbMap) { map<string,vector<vector<string> > >::iterator cpbMapIt; for(cpbMapIt=cpbMap.begin();cpbMapIt!=cpbMap.end();cpbMapIt++) { string nodeName=cpbMapIt->first; vector<vector<string> > cpbSet=cpbMapIt->second; vector<vector<string> >::iterator cpbSetIt; cout<<nodeName<<" "<<cpbSet.size()<<" :"<<endl; for(cpbSetIt=cpbSet.begin();cpbSetIt!=cpbSet.end();cpbSetIt++) { vector<string> cpb=*cpbSetIt; vector<string>::iterator cpbIt; for(cpbIt=cpb.begin();cpbIt!=cpb.end();cpbIt++) { cout<<*cpbIt<<" "; } cout<<endl; } } } // FP void OutputCpbFPTree(map<string,FPNode *> &cpbFPTreeMap) { map<string,FPNode *>::iterator cpbFPTreeMapIt; for(cpbFPTreeMapIt=cpbFPTreeMap.begin();cpbFPTreeMapIt!=cpbFPTreeMap.end();cpbFPTreeMapIt++) { string nodeName=cpbFPTreeMapIt->first; FPNode *root=cpbFPTreeMapIt->second; cout<<nodeName<<" FP :"<<endl; OutputFPTree(root); } } int main() { char *fileName="retail.dat"; int minSupport=6171;// clock_t start=clock(); vector<vector<string> > database;// ObtainDatabase(database,fileName); vector<FPNode> large1; CreateItemset(database,large1,minSupport); int k=1; vector<FPNode> largeTemp=large1; // OutputLargeItemset(largeTemp,k); FPNode *root=NULL; // FPTree CreateFPTree(database,large1,&root); // FP //OutputFPTree(root); //if(root==NULL)cout<<"root is null"<<endl; map<string,vector<vector<string> > > cpbMap; // large1 conditional pattern base 。 。 CreateConditionalPatterBase(large1,cpbMap); // //OutputConditionalPatterBase(cpbMap); //if(cpbMap.empty()){cout<<"cpbMap is empty"<<endl;}/////////////////////////////////////////////////////////////////////////////////// map<string,FPNode *> cpbFPTreeMap; // FP , FP CreateCpbFPtree(cpbMap,cpbFPTreeMap,minSupport); // FP //OutputCpbFPTree(cpbFPTreeMap); map<string,map<list<string>,int>* > frequentPatternMap; // FP , ObtainFrequenPattern(cpbFPTreeMap,frequentPatternMap); // OutputFrequentPattern(frequentPatternMap,minSupport); clock_t end=clock(); cout<<"Finish! :"<<(end-start)<<"ms"<<endl; system("pause"); }

좋은 웹페이지 즐겨찾기