java는 Nagao 알고리즘을 사용하여 새로운 단어 발견, 인기 단어 발굴을 실현한다
15459 단어 javaNagao 알고리즘
설명:
Nagao 알고리즘: 모든 하위 문자열의 빈도를 신속하게 통계하는 알고리즘입니다.상세한 알고리즘을 볼 수 있다http://www.doc88.com/p-664123446503.html
단어 주파수: 이 문자열이 문서에 나타나는 횟수입니다.출현 횟수가 많을수록 중요하다.
좌우 인접 개수: 문서에서 이 문자열의 왼쪽과 오른쪽에 나타나는 다른 글자의 개수입니다.좌우 인접이 많을수록 문자열이 단어가 될 확률이 높다는 것을 설명한다.
좌우 엔트로피: 문서에서 이 문자열의 왼쪽과 오른쪽에 나타나는 서로 다른 글자의 수량 분포 엔트로피.위의 지표와 유사하게 일정한 차이가 있다.
상호작용 정보: 매번 어떤 문자열을 두 부분, 왼쪽 부분의 문자열과 오른쪽 부분의 문자열로 나누어 동시에 나타나는 확률을 계산하고 각각의 독립적으로 나타나는 확률을 제외하고 마지막으로 모든 구분 안의 확률의 최소값을 취한다.이 값이 클수록 문자열 내부의 응집도가 높고 단어가 될 수 있음을 나타낸다.
알고리즘 세부 절차:
1. 입력 파일을 한 줄 한 줄 읽어 비한자([^\u4E00-\u9FA5]+) 및 정지어에 따라 "아주 좋습니까. 글쎄요. 하나도 안 쓰는 것보다 낫습니다. 버티고 나서 말하지 않았습니다."
다음과 같은 문자열로 나뉩니다.
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
사용 정지어는 수정할 수 있다.
2. 분할된 모든 문자열의 왼쪽 문자열과 오른쪽 문자열을 가져와 각각 왼쪽, 오른쪽 Ptable을 넣는다
3. Ptable을 정렬하고 LTable을 계산합니다.LTable은 정렬된 Ptable의 다음 하위 문자열과 이전 하위 문자열의 숫자를 기록합니다.
4. Ptable와 LTable을 반복하면 모든 하위 문자열의 단어 주파수, 좌우 인접
5. 모든 하위 문자열의 주파수, 좌우 인접 결과에 따라 문자열의 주파수, 좌우 인접 개수, 좌우 엔트로피, 상호작용 정보를 출력한다
1. NagaoAlgorithm.java
package com.algo.word;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class NagaoAlgorithm {
private int N;
private List<String> leftPTable;
private int[] leftLTable;
private List<String> rightPTable;
private int[] rightLTable;
private double wordNumber;
private Map<String, TFNeighbor> wordTFNeighbor;
private final static String stopwords = " ";
private NagaoAlgorithm(){
//default N = 5
N = 5;
leftPTable = new ArrayList<String>();
rightPTable = new ArrayList<String>();
wordTFNeighbor = new HashMap<String, TFNeighbor>();
}
//reverse phrase
private String reverse(String phrase) {
StringBuilder reversePhrase = new StringBuilder();
for (int i = phrase.length() - 1; i >= 0; i--)
reversePhrase.append(phrase.charAt(i));
return reversePhrase.toString();
}
//co-prefix length of s1 and s2
private int coPrefixLength(String s1, String s2){
int coPrefixLength = 0;
for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){
if(s1.charAt(i) == s2.charAt(i)) coPrefixLength++;
else break;
}
return coPrefixLength;
}
//add substring of line to pTable
private void addToPTable(String line){
//split line according to consecutive none Chinese character
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
for(String phrase : phrases){
for(int i = 0; i < phrase.length(); i++)
rightPTable.add(phrase.substring(i));
String reversePhrase = reverse(phrase);
for(int i = 0; i < reversePhrase.length(); i++)
leftPTable.add(reversePhrase.substring(i));
wordNumber += phrase.length();
}
}
//count lTable
private void countLTable(){
Collections.sort(rightPTable);
rightLTable = new int[rightPTable.size()];
for(int i = 1; i < rightPTable.size(); i++)
rightLTable[i] = coPrefixLength(rightPTable.get(i-1), rightPTable.get(i));
Collections.sort(leftPTable);
leftLTable = new int[leftPTable.size()];
for(int i = 1; i < leftPTable.size(); i++)
leftLTable[i] = coPrefixLength(leftPTable.get(i-1), leftPTable.get(i));
System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
}
//according to pTable and lTable, count statistical result: TF, neighbor distribution
private void countTFNeighbor(){
//get TF and right neighbor
for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){
String phrase = rightPTable.get(pIndex);
for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){
String word = phrase.substring(0, length);
TFNeighbor tfNeighbor = new TFNeighbor();
tfNeighbor.incrementTF();
if(phrase.length() > length)
tfNeighbor.addToRightNeighbor(phrase.charAt(length));
for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){
if(rightLTable[lIndex] >= length){
tfNeighbor.incrementTF();
String coPhrase = rightPTable.get(lIndex);
if(coPhrase.length() > length)
tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
}
else break;
}
wordTFNeighbor.put(word, tfNeighbor);
}
}
//get left neighbor
for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){
String phrase = leftPTable.get(pIndex);
for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){
String word = reverse(phrase.substring(0, length));
TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
if(phrase.length() > length)
tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){
if(leftLTable[lIndex] >= length){
String coPhrase = leftPTable.get(lIndex);
if(coPhrase.length() > length)
tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
}
else break;
}
}
}
System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
}
//according to wordTFNeighbor, count MI of word
private double countMI(String word){
if(word.length() <= 1) return 0;
double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber;
List<Double> mi = new ArrayList<Double>(word.length());
for(int pos = 1; pos < word.length(); pos++){
String leftPart = word.substring(0, pos);
String rightPart = word.substring(pos);
double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber;
double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber;
mi.add(coProbability/(leftProbability*rightProbability));
}
return Collections.min(mi);
}
//save TF, (left and right) neighbor number, neighbor entropy, mutual information
private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){
try {
//read stop words file
Set<String> stopWords = new HashSet<String>();
BufferedReader br = new BufferedReader(new FileReader(stopList));
String line;
while((line = br.readLine()) != null){
if(line.length() > 1)
stopWords.add(line);
}
br.close();
//output words TF, neighbor info, MI
BufferedWriter bw = new BufferedWriter(new FileWriter(out));
for(Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){
if( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue;
TFNeighbor tfNeighbor = entry.getValue();
int tf, leftNeighborNumber, rightNeighborNumber;
double mi;
tf = tfNeighbor.getTF();
leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
mi = countMI(entry.getKey());
if(tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) &&
rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3]) ){
StringBuilder sb = new StringBuilder();
sb.append(entry.getKey());
sb.append(",").append(tf);
sb.append(",").append(leftNeighborNumber);
sb.append(",").append(rightNeighborNumber);
sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
sb.append(",").append(mi).append("
");
bw.write(sb.toString());
}
}
bw.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");
}
//apply nagao algorithm to input file
public static void applyNagao(String[] inputs, String out, String stopList){
NagaoAlgorithm nagao = new NagaoAlgorithm();
//step 1: add phrases to PTable
String line;
for(String in : inputs){
try {
BufferedReader br = new BufferedReader(new FileReader(in));
while((line = br.readLine()) != null){
nagao.addToPTable(line);
}
br.close();
} catch (IOException e) {
throw new RuntimeException();
}
}
System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
//step 2: sort PTable and count LTable
nagao.countLTable();
//step3: count TF and Neighbor
nagao.countTFNeighbor();
//step4: save TF NeighborInfo and MI
nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(","));
}
public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){
NagaoAlgorithm nagao = new NagaoAlgorithm();
nagao.setN(n);
String[] threshold = filter.split(",");
if(threshold.length != 4){
System.out.println("ERROR: filter must have 4 numbers, seperated with ',' ");
return;
}
//step 1: add phrases to PTable
String line;
for(String in : inputs){
try {
BufferedReader br = new BufferedReader(new FileReader(in));
while((line = br.readLine()) != null){
nagao.addToPTable(line);
}
br.close();
} catch (IOException e) {
throw new RuntimeException();
}
}
System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
//step 2: sort PTable and count LTable
nagao.countLTable();
//step3: count TF and Neighbor
nagao.countTFNeighbor();
//step4: save TF NeighborInfo and MI
nagao.saveTFNeighborInfoMI(out, stopList, threshold);
}
private void setN(int n){
N = n;
}
public static void main(String[] args) {
String[] ins = {"E://test//ganfen.txt"};
applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
}
}
2. TFNeighbor.java
package com.algo.word;
import java.util.HashMap;
import java.util.Map;
public class TFNeighbor {
private int tf;
private Map<Character, Integer> leftNeighbor;
private Map<Character, Integer> rightNeighbor;
TFNeighbor(){
leftNeighbor = new HashMap<Character, Integer>();
rightNeighbor = new HashMap<Character, Integer>();
}
//add word to leftNeighbor
public void addToLeftNeighbor(char word){
//leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0));
Integer number = leftNeighbor.get(word);
leftNeighbor.put(word, number == null? 1: 1+number);
}
//add word to rightNeighbor
public void addToRightNeighbor(char word){
//rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0));
Integer number = rightNeighbor.get(word);
rightNeighbor.put(word, number == null? 1: 1+number);
}
//increment tf
public void incrementTF(){
tf++;
}
public int getLeftNeighborNumber(){
return leftNeighbor.size();
}
public int getRightNeighborNumber(){
return rightNeighbor.size();
}
public double getLeftNeighborEntropy(){
double entropy = 0;
int sum = 0;
for(int number : leftNeighbor.values()){
entropy += number*Math.log(number);
sum += number;
}
if(sum == 0) return 0;
return Math.log(sum) - entropy/sum;
}
public double getRightNeighborEntropy(){
double entropy = 0;
int sum = 0;
for(int number : rightNeighbor.values()){
entropy += number*Math.log(number);
sum += number;
}
if(sum == 0) return 0;
return Math.log(sum) - entropy/sum;
}
public int getTF(){
return tf;
}
}
3. Main.java
package com.algo.word;
public class Main {
public static void main(String[] args) {
//if 3 arguments, first argument is input files splitting with ','
//second argument is output file
//output 7 columns split with ',' , like below:
//word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
//third argument is stop words list
if(args.length == 3)
NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
//if 4 arguments, forth argument is the NGram parameter N
//5th argument is threshold of output words, default is "20,3,3,5"
//output TF > 20 && (left | right) neighbor number > 3 && MI > 5
else if(args.length == 5)
NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
}
}
위에서 말한 것이 바로 본문의 전체 내용입니다. 여러분이 좋아하시기 바랍니다.
이 내용에 흥미가 있습니까?
현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:
38. Java의 Leetcode 솔루션텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.
CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.