Hadoop 파티셔너(Partitioner) 구성 요소

14634 단어 hadoop 빅 데이터

1. Partitioner 구성 요소는 Map 출력의 Key를 파티션으로 나누어, 키에 따라 서로 다른 reduce로 보내 처리할 수 있게 합니다. 2. key의 분배 규칙을 사용자 정의할 수 있습니다. 예를 들어 데이터 파일에 서로 다른 성(省)의 데이터가 들어 있고, 출력 요구 사항이 성마다 하나의 파일을 출력하는 것인 경우입니다. 3. 기본 구현인 HashPartitioner는 org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.java에 있습니다.

package org.apache.hadoop.mapreduce.lib.partition;

import org.apache.hadoop.mapreduce.Partitioner;

/** Partition keys by their {@link Object#hashCode()}. */
public class HashPartitioner<K, V> extends Partitioner<K, V> {

  /** Use {@link Object#hashCode()} to partition. */
  public int getPartition(K key, V value,
                          int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }

}

4. 사용자 정의 Partitioner 1) 추상 클래스 Partitioner를 상속하여 getPartition() 메서드를 직접 구현합니다. 2) job.setPartitionerClass()를 통해 사용자 정의 Partitioner를 설정합니다. 추상 클래스 Partitioner는 org.apache.hadoop.mapreduce.Partitioner.java에 있습니다.

package org.apache.hadoop.mapreduce;

/**
 * Partitions the key space.
 *
 * <p>{@code Partitioner} controls the partitioning of the keys of the
 * intermediate map-outputs. The key (or a subset of the key) is used to derive
 * the partition, typically by a hash function. The total number of partitions
 * is the same as the number of reduce tasks for the job. Hence this controls
 * which of the {@code m} reduce tasks the intermediate key (and hence the
 * record) is sent to for reduction.
 *
 * @param <KEY> the type of the intermediate map-output keys
 * @param <VALUE> the type of the intermediate map-output values
 * @see Reducer
 */
public abstract class Partitioner<KEY, VALUE> {

  /**
   * Get the partition number for a given key (hence record) given the total
   * number of partitions i.e. number of reduce-tasks for the job.
   *
   * <p>Typically a hash function on all or a subset of the key.
   *
   * @param key the key to be partitioned.
   * @param value the entry value.
   * @param numPartitions the total number of partitions.
   * @return the partition number for the <code>key</code>.
   */
  public abstract int getPartition(KEY key, VALUE value, int numPartitions);

}

Partitioner 예제 — Partitioner 응용 사례: 요구 사항: 각 상품의 주간 판매량을 상품별로 집계합니다. 사이트 1의 주간 판매 목록: shoes 20 hat 10 stockings 30 clothes 40
사이트 2의 주간 판매 목록: shoes 15 hat 1 stockings 90 clothes 80
집계 결과: shoes 35 hat 11 stockings 120 clothes 120
코드는 다음과 같습니다: MyMapper.java

package com.partitioner;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that turns one sales record per input line into a
 * (product, quantity) pair.
 *
 * <p>Each non-blank line is expected to contain a product name followed by an
 * integer quantity, separated by whitespace, for example {@code shoes 20}.
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Reused for every record so that map() does not allocate two objects per line.
    private final Text product = new Text();
    private final IntWritable quantity = new IntWritable();

    /**
     * Parses one input line and emits its product name with its quantity.
     *
     * <p>Blank lines are skipped. Surrounding whitespace is ignored so that an
     * indented line does not produce an empty product name.
     *
     * @param key the input key supplied by the framework; only quoted in error messages
     * @param value the text of the input line
     * @param context the context the (product, quantity) pair is written to
     * @throws IOException if the line has fewer than two fields, if the second
     *         field is not an integer, or if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(LongWritable key, Text value,Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.isEmpty()) {
            // Nothing to count on an empty line.
            return;
        }

        String[] fields = line.split("\\s+");
        if (fields.length < 2) {
            throw new IOException("Malformed record at key " + key
                    + ": expected '<product> <quantity>' but got '" + line + "'");
        }

        try {
            quantity.set(Integer.parseInt(fields[1]));
        } catch (NumberFormatException e) {
            throw new IOException("Malformed record at key " + key
                    + ": quantity is not an integer in '" + line + "'", e);
        }
        product.set(fields[0]);
        context.write(product, quantity);
    }

}

MyPartitioner.java

package com.partitioner;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * Partitioner that gives each known product its own partition:
 * {@code shoes}, {@code hat} and {@code stockings} get slots 0, 1 and 2, and
 * every other key shares slot 3.
 */
public class MyPartitioner extends Partitioner<Text,IntWritable>{

    /**
     * Chooses the partition for a product key.
     *
     * <p>The preferred slot is reduced modulo {@code numPartitions}, so the
     * result stays inside {@code [0, numPartitions)} even when the job runs
     * with fewer than four reduce tasks. With four or more reduce tasks the
     * preferred slot is returned unchanged.
     *
     * @param key the product name
     * @param value the quantity paired with the key; not consulted
     * @param numPartitions the total number of partitions
     * @return the partition index for {@code key}
     */
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Convert once instead of once per comparison.
        String product = key.toString();

        int slot;
        if ("shoes".equals(product)) {
            slot = 0;
        } else if ("hat".equals(product)) {
            slot = 1;
        } else if ("stockings".equals(product)) {
            slot = 2;
        } else {
            slot = 3;
        }

        // Never hand back an index outside the range the caller asked for.
        return slot % numPartitions;
    }

}

MyReducer.java

package com.partitioner;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that adds up all quantities received for one product key and emits
 * a single (product, total) pair.
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Sums the values grouped under {@code key} and writes the total.
     *
     * @param key the product name
     * @param value the quantities collected for {@code key}
     * @param context the context the (product, total) pair is written to
     * @throws IOException if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> value,Context context)
            throws IOException, InterruptedException {
        int total = 0;
        // The element type must be declared on the Iterable; with a raw
        // Iterable this loop variable would not compile.
        for (IntWritable quantity : value) {
            total += quantity.get();
        }
        context.write(key, new IntWritable(total));
    }

}

TestPartitioner.java

package com.partitioner;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Driver that configures and submits the sales-aggregation job using
 * {@link MyMapper}, {@link MyPartitioner} and {@link MyReducer}.
 */
public class TestPartitioner {

    /**
     * Parses the command line, sets up the job and waits for it to finish.
     *
     * <p>After generic Hadoop options are removed, exactly two arguments must
     * remain: the input path and the output path. The process exits with
     * status 2 on a usage error, 0 when the job succeeds and 1 when it fails.
     *
     * @param args command-line arguments, optionally including generic Hadoop options
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }

        // Factory method; the Job(Configuration, String) constructor is deprecated.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(TestPartitioner.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setPartitionerClass(MyPartitioner.class);
        // MyPartitioner uses slots 0..3, so four reduce tasks give each slot its own output.
        job.setNumReduceTasks(4);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

Azure HDInsight + Microsoft R Server에서 연산 처리 분산

Microsoft Azure HDInsight는 Microsoft가 제공하는 Hadoop의 PaaS 서비스로 인프라 주변의 구축 노하우를 몰라도 훌륭한 Hadoop 클러스터를 구축할 수 있는 훌륭한 서비스입니다. 이...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다.하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

FastAPI가 포함된 Python REST API, CRUD 애플리케이션

Fluke 설치

좋은 웹페이지 즐겨찾기

개발자 우수 사이트 수집

개발자가 알아야 할 필수 사이트 100선 추천 우리는 당신을 위해 100개의 자주 사용하는 개발자 학습 사이트를 정리했습니다