java는 Nagao 알고리즘을 사용하여 새로운 단어 발견, 인기 단어 발굴을 실현한다

15459 단어 javaNagao 알고리즘
Nagao 알고리즘을 이용하여 각 하위 문자열의 빈도를 통계한 다음에 이러한 빈도를 바탕으로 각 문자열의 단어 빈도, 좌우 인접 개수, 좌우 엔트로피, 상호작용 정보(내부 응집도)를 통계한다.
설명:
Nagao 알고리즘: 모든 하위 문자열의 빈도를 신속하게 통계하는 알고리즘입니다.상세한 알고리즘을 볼 수 있다http://www.doc88.com/p-664123446503.html
단어 주파수: 이 문자열이 문서에 나타나는 횟수입니다.출현 횟수가 많을수록 중요하다.
좌우 인접 개수: 문서에서 이 문자열의 왼쪽과 오른쪽에 나타나는 다른 글자의 개수입니다.좌우 인접이 많을수록 문자열이 단어가 될 확률이 높다는 것을 설명한다.
좌우 엔트로피: 문서에서 이 문자열의 왼쪽과 오른쪽에 나타나는 서로 다른 글자의 수량 분포 엔트로피.위의 지표와 유사하게 일정한 차이가 있다.
상호작용 정보: 매번 어떤 문자열을 두 부분, 왼쪽 부분의 문자열과 오른쪽 부분의 문자열로 나누어 동시에 나타나는 확률을 계산하고 각각의 독립적으로 나타나는 확률을 제외하고 마지막으로 모든 구분 안의 확률의 최소값을 취한다.이 값이 클수록 문자열 내부의 응집도가 높고 단어가 될 수 있음을 나타낸다.
알고리즘 세부 절차:
1. 입력 파일을 한 줄 한 줄 읽어 비한자([^\u4E00-\u9FA5]+) 및 정지어에 따라 "아주 좋습니까. 글쎄요. 하나도 안 쓰는 것보다 낫습니다. 버티고 나서 말하지 않았습니다."
다음과 같은 문자열로 나뉩니다.
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
사용 정지어는 수정할 수 있다.
2. 분할된 모든 문자열의 왼쪽 문자열과 오른쪽 문자열을 가져와 각각 왼쪽, 오른쪽 Ptable을 넣는다
3. Ptable을 정렬하고 LTable을 계산합니다.LTable은 정렬된 Ptable의 다음 하위 문자열과 이전 하위 문자열의 숫자를 기록합니다.
4. Ptable와 LTable을 반복하면 모든 하위 문자열의 단어 주파수, 좌우 인접
5. 모든 하위 문자열의 주파수, 좌우 인접 결과에 따라 문자열의 주파수, 좌우 인접 개수, 좌우 엔트로피, 상호작용 정보를 출력한다
1.  NagaoAlgorithm.java

package com.algo.word;
 
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
 
public class NagaoAlgorithm {
   
  private int N;
   
  private List<String> leftPTable;
  private int[] leftLTable;
  private List<String> rightPTable;
  private int[] rightLTable;
  private double wordNumber;
   
  private Map<String, TFNeighbor> wordTFNeighbor;
   
  private final static String stopwords = " ";
   
  private NagaoAlgorithm(){
    //default N = 5
    N = 5;
    leftPTable = new ArrayList<String>();
    rightPTable = new ArrayList<String>();
    wordTFNeighbor = new HashMap<String, TFNeighbor>();
  }
  //reverse phrase
  private String reverse(String phrase) {
    StringBuilder reversePhrase = new StringBuilder();
    for (int i = phrase.length() - 1; i >= 0; i--)
      reversePhrase.append(phrase.charAt(i));
    return reversePhrase.toString();
  }
  //co-prefix length of s1 and s2
  private int coPrefixLength(String s1, String s2){
    int coPrefixLength = 0;
    for(int i = 0; i < Math.min(s1.length(), s2.length()); i++){
      if(s1.charAt(i) == s2.charAt(i))  coPrefixLength++;
      else break;
    }
    return coPrefixLength;
  }
  //add substring of line to pTable
  private void addToPTable(String line){
    //split line according to consecutive none Chinese character
    String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
    for(String phrase : phrases){
      for(int i = 0; i < phrase.length(); i++)
        rightPTable.add(phrase.substring(i));
      String reversePhrase = reverse(phrase);
      for(int i = 0; i < reversePhrase.length(); i++)
        leftPTable.add(reversePhrase.substring(i));
      wordNumber += phrase.length();
    }
  }
   
  //count lTable
  private void countLTable(){
    Collections.sort(rightPTable);
    rightLTable = new int[rightPTable.size()];
    for(int i = 1; i < rightPTable.size(); i++)
      rightLTable[i] = coPrefixLength(rightPTable.get(i-1), rightPTable.get(i));
     
    Collections.sort(leftPTable);
    leftLTable = new int[leftPTable.size()];
    for(int i = 1; i < leftPTable.size(); i++)
      leftLTable[i] = coPrefixLength(leftPTable.get(i-1), leftPTable.get(i));
     
    System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
  }
  //according to pTable and lTable, count statistical result: TF, neighbor distribution
  private void countTFNeighbor(){
    //get TF and right neighbor
    for(int pIndex = 0; pIndex < rightPTable.size(); pIndex++){
      String phrase = rightPTable.get(pIndex);
      for(int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){
        String word = phrase.substring(0, length);
        TFNeighbor tfNeighbor = new TFNeighbor();
        tfNeighbor.incrementTF();
        if(phrase.length() > length)
          tfNeighbor.addToRightNeighbor(phrase.charAt(length));
        for(int lIndex = pIndex+1; lIndex < rightLTable.length; lIndex++){
          if(rightLTable[lIndex] >= length){
            tfNeighbor.incrementTF();
            String coPhrase = rightPTable.get(lIndex);
            if(coPhrase.length() > length)
              tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
          }
          else break;
        }
        wordTFNeighbor.put(word, tfNeighbor);
      }
    }
    //get left neighbor
    for(int pIndex = 0; pIndex < leftPTable.size(); pIndex++){
      String phrase = leftPTable.get(pIndex);
      for(int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){
        String word = reverse(phrase.substring(0, length));
        TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
        if(phrase.length() > length)
          tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
        for(int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++){
          if(leftLTable[lIndex] >= length){
            String coPhrase = leftPTable.get(lIndex);
            if(coPhrase.length() > length)
              tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
          }
          else break;
        }
      }
    }
    System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
  }
  //according to wordTFNeighbor, count MI of word
  private double countMI(String word){
    if(word.length() <= 1)  return 0;
    double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber;
    List<Double> mi = new ArrayList<Double>(word.length());
    for(int pos = 1; pos < word.length(); pos++){
      String leftPart = word.substring(0, pos);
      String rightPart = word.substring(pos);
      double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber;
      double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber;
      mi.add(coProbability/(leftProbability*rightProbability));
    }
    return Collections.min(mi);
  }
  //save TF, (left and right) neighbor number, neighbor entropy, mutual information
  private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){
    try {
      //read stop words file
      Set<String> stopWords = new HashSet<String>();
      BufferedReader br = new BufferedReader(new FileReader(stopList));
      String line;
      while((line = br.readLine()) != null){
        if(line.length() > 1)
          stopWords.add(line);
      }
      br.close();
      //output words TF, neighbor info, MI
      BufferedWriter bw = new BufferedWriter(new FileWriter(out));
      for(Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){
        if( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue;
        TFNeighbor tfNeighbor = entry.getValue();
         
         
        int tf, leftNeighborNumber, rightNeighborNumber;
        double mi;
        tf = tfNeighbor.getTF();
        leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
        rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
        mi = countMI(entry.getKey());
        if(tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) && 
            rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3]) ){
          StringBuilder sb = new StringBuilder();
          sb.append(entry.getKey());
          sb.append(",").append(tf);
          sb.append(",").append(leftNeighborNumber);
          sb.append(",").append(rightNeighborNumber);
          sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
          sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
          sb.append(",").append(mi).append("
"); bw.write(sb.toString()); } } bw.close(); } catch (IOException e) { throw new RuntimeException(e); } System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file"); } //apply nagao algorithm to input file public static void applyNagao(String[] inputs, String out, String stopList){ NagaoAlgorithm nagao = new NagaoAlgorithm(); //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(",")); } public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){ NagaoAlgorithm nagao = new NagaoAlgorithm(); nagao.setN(n); String[] threshold = filter.split(","); if(threshold.length != 4){ System.out.println("ERROR: filter must have 4 numbers, seperated with ',' "); return; } //step 1: add phrases to PTable String line; for(String in : inputs){ try { BufferedReader br = new BufferedReader(new FileReader(in)); while((line = br.readLine()) != null){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable"); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, threshold); } private void setN(int n){ N = n; } public static void main(String[] args) { String[] ins = {"E://test//ganfen.txt"}; applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt"); } }
2. TFNeighbor.java

package com.algo.word;
 
import java.util.HashMap;
import java.util.Map;
 
public class TFNeighbor {
 
  private int tf;
  private Map<Character, Integer> leftNeighbor;
  private Map<Character, Integer> rightNeighbor;
   
  TFNeighbor(){
    leftNeighbor = new HashMap<Character, Integer>();
    rightNeighbor = new HashMap<Character, Integer>();
  }
  //add word to leftNeighbor
  public void addToLeftNeighbor(char word){
    //leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0));
    Integer number = leftNeighbor.get(word);
    leftNeighbor.put(word, number == null? 1: 1+number);
  }
  //add word to rightNeighbor
  public void addToRightNeighbor(char word){
    //rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0));
    Integer number = rightNeighbor.get(word);
    rightNeighbor.put(word, number == null? 1: 1+number);
  }
  //increment tf
  public void incrementTF(){
    tf++;
  }
  public int getLeftNeighborNumber(){
    return leftNeighbor.size();
  }
  public int getRightNeighborNumber(){
    return rightNeighbor.size();
  }
  public double getLeftNeighborEntropy(){
    double entropy = 0;
    int sum = 0;
    for(int number : leftNeighbor.values()){
      entropy += number*Math.log(number);
      sum += number;
    }
    if(sum == 0)  return 0;
    return Math.log(sum) - entropy/sum;
  }
  public double getRightNeighborEntropy(){
    double entropy = 0;
    int sum = 0;
    for(int number : rightNeighbor.values()){
      entropy += number*Math.log(number);
      sum += number;
    }
    if(sum == 0)  return 0;
    return Math.log(sum) - entropy/sum;
  }
  public int getTF(){
    return tf;
  }
}

3. Main.java

package com.algo.word;
 
public class Main {
 
  public static void main(String[] args) {
     
    //if 3 arguments, first argument is input files splitting with ','
    //second argument is output file
    //output 7 columns split with ',' , like below:
    //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
    //third argument is stop words list
    if(args.length == 3)
      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
     
    //if 4 arguments, forth argument is the NGram parameter N
    //5th argument is threshold of output words, default is "20,3,3,5"
    //output TF > 20 && (left | right) neighbor number > 3 && MI > 5
    else if(args.length == 5)
      NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
     
     
  }
 
}
위에서 말한 것이 바로 본문의 전체 내용입니다. 여러분이 좋아하시기 바랍니다.

좋은 웹페이지 즐겨찾기