VAD 구현 (3) - 알고리즘 계산 프로세스와 프레임워크

VAD 알고리즘을 구현하기 전에, 여기서 VAD 알고리즘의 구현 절차와 알고리즘 프레임워크를 먼저 제시합니다.
호출 관계는 detect_wav -> detect_frame -> process_vad -> energy_detect 입니다. energy_detect는 비교적 복잡하기 때문에 여기서는 다루지 않습니다. 아래 코드는 기본적인 계산 절차와 구조를 나타낸 것입니다.

/*
 * Compile-time analysis parameters.
 *
 * Objects with static storage duration must be initialised with constant
 * expressions in C, so the derived value msec_per_frame is computed from
 * these enum constants instead of from the int variables below.
 */
enum {
	VAD_FRAME_SIZE  = 256,  /* samples per analysis frame */
	VAD_SAMPLE_RATE = 8000, /* samples per second */
	VAD_FRAME_STEP  = 80    /* samples between consecutive frame starts */
};

int frame_size = VAD_FRAME_SIZE;
int sample_rate = VAD_SAMPLE_RATE;
int frame_step = VAD_FRAME_STEP;
int msec_per_frame = 1000 * VAD_FRAME_SIZE / VAD_SAMPLE_RATE; /* 32 ms */

/*
 * Hangover lengths, in frames. detect_frame() only reconsiders the current
 * state once silence_count / speech_count exceeds the matching length.
 * With an 80-sample step at 8000 Hz one step is 10 ms, so 30 is about 300 ms.
 */
int silence_filter_len = 30;
int speech_filter_len = 30;

/* Run lengths: incremented on each frame of that kind and reset to 0 by a
 * frame of the other kind. */
int number_of_silence = 0;
int number_of_speech = 0;

/* Totals per kind; cleared (together with the run lengths) each time
 * detect_frame() re-evaluates the state. */
int total_of_silence = 0;
int total_of_speech = 0;

/* silence_count: silence frames seen while the state is speech; cleared after
 * more than 5 speech frames in a row. speech_count is the mirror image. */
int silence_count = 0;
int speech_count = 0;

/* Number of frames classified so far (end-of-data and wait results excluded). */
int frame_count = 0;

위에 명시된 것은 전역 변수입니다.

/**
*
* data：  WAV     。nsample：WAV      。            
*
**/
int detect_wav(short *data,int nsample){
	int state = state_SILENCE ;  //    ：  
	int index = 0,offset = 0;
	int cst = 0; //       ，       
	//index            ，             
	int ret = detect_frame(data,&index); //          ,     256    ，ret        ，  、  、   。

	while(ret != state_OVER){  //       ，       。
		if(ret != state_WAIT){  //           
			if(ret  != st ){ //             
				int k ;
				if(ret == state_SILENCE){
					k = index * frame_step -  silence_filter_len * frame_step ;//           。
					if(k <= cst){ //
						ret = st;
						continue;
					}
					//     
					printf("%3.2f %3.2f Speech
",cst/(float)sample_rate,k/(float)sample_rate);
				}else{
					k = index * frame_step - speech_filter_len  * frame_step;//               
					if(k <= cst){
						ret = st;
						continue;
					}
					printf("%3.2f %3.2f Silence
", cst/(float)sample_rate, ((k/(float)sample_rate)>0?(k/(float)sample_rate):0.00));
				}
				cst = k;   //       K  。
				st = ret ; //       
			}
		}
		//        
		offset += frame_step ; //         ，          
		if(offset <= nsample - frame_size){
			ret = detect_frame(data+offset,&index);
		}else{
			ret = detect_frame(NULL,&index);
		}
	}
        //           
       if (st == state_SPEEACH){
                printf("%3.2f %3.2f SPeech
", cst/(float)sample_rate, nSample/(float)sample_rate);
       }else{
                printf("%3.2f %3.2f Silence
", cst/(float)sample_rate, nSample/(float)sample_rate);
        }
        return 0;
}

위의 함수는 모든 데이터를 순환하면서 한 프레임마다 detect_frame을 호출합니다. detect_frame의 처리는 다음과 같습니다:

/*
 * Pick speech or silence by comparing the two frame totals (silence wins a
 * tie), then clear the run-length and total counters.
 */
static int vad_settle_state(void)
{
	int winner = state_SILENCE;

	if (total_of_speech > total_of_silence)
		winner = state_SPEECH;

	number_of_silence = 0;
	number_of_speech = 0;
	total_of_silence = 0;
	total_of_speech = 0;
	return winner;
}

/**
 * Classify one frame and apply the hangover/smoothing counters.
 *
 * data:  frame samples, forwarded unchanged to process_vad().
 * index: frame counter, forwarded unchanged to process_vad().
 * state: the caller's current state (speech or silence).
 *
 * Returns the end-of-data or wait code straight from process_vad() when it
 * reports one; otherwise returns the possibly updated state.
 */
int detect_frame(short *data, int *index, int state)
{
	int verdict = process_vad(data, index);

	/* No classification for this call: hand the code back untouched. */
	if (verdict == state_OVER)
		return verdict;
	if (verdict == state_WAIT)
		return verdict;

	if (verdict != state_SILENCE) {
		/* Speech frame. */
		total_of_speech += 1;
		number_of_speech += 1;
		number_of_silence = 0;

		/* More than 5 speech frames in a row cancel pending silence evidence. */
		if (number_of_speech > 5)
			silence_count = 0;

		/* Speech seen while in silence counts towards a possible switch. */
		if (state == state_SILENCE) {
			speech_count += 1;
			if (speech_count > speech_filter_len)
				state = vad_settle_state();
		}
	} else {
		/* Silence frame. */
		number_of_silence += 1;
		total_of_silence += 1;
		number_of_speech = 0;

		/* More than 5 silence frames in a row cancel pending speech evidence. */
		if (number_of_silence > 5)
			speech_count = 0;

		/* Silence seen while in speech counts towards a possible switch. */
		if (state == state_SPEECH) {
			silence_count += 1;
			if (silence_count > silence_filter_len)
				state = vad_settle_state();
		}
	}

	frame_count += 1;
	return state;
}

위의 detect_frame은 사실상 후처리 함수입니다. 현재 프레임의 가능한 상태를 반환받은 후에 상태 변경과 카운팅을 수행합니다. 실제 데이터 처리 작업은 다음과 같습니다.

/**
 * Classify a single frame as speech or silence.
 *
 * data:  frame samples; NULL signals that no more data is available.
 * index: frame counter, incremented once for every frame classified.
 *
 * Returns state_OVER when data is NULL, otherwise state_SPEECH when flag is 0
 * and state_SILENCE for any other flag value.
 */
int process_vad(short *data,int *index){
	if (data == NULL) {
		return state_OVER;
	}

	/*
	 * NOTE(review): total_rms and flag are not declared anywhere in this
	 * excerpt, and the step that computes total_rms from data appears to be
	 * elided. If energy_detect() is meant to write flag, it needs a pointer
	 * (or to be a macro) -- confirm against its definition.
	 */
	energy_detect(total_rms,flag);

	/*
	 * Postfix ++ binds tighter than unary *, so the parentheses are needed
	 * to increment the counter itself rather than the local pointer.
	 */
	(*index)++;

	return flag == 0 ? state_SPEECH : state_SILENCE ;
}

이제 가장 중요한 구현만 남았습니다. 바로 energy_detect, 즉 에너지 기반 검출이며, 여기서는 track energy 방법을 사용합니다.

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

다양한 언어의 JSON

JSON은 Javascript 표기법을 사용하여 데이터 구조를 레이아웃하는 데이터 형식입니다. 그러나 Javascript가 코드에서 이러한 구조를 나타낼 수 있는 유일한 언어는 아닙니다. 저는 일반적으로 '객체'{}...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

VAD 구현 (3) - 알고리즘 계산 프로세스와 프레임워크

좋은 웹페이지 즐겨찾기